From dd58ef019b700900793a1eb48b52123db01b654e Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 30 Dec 2015 11:46:15 +0000 Subject: Vendor import of llvm trunk r256633: https://llvm.org/svn/llvm-project/llvm/trunk@256633 --- .../aarch64-2014-08-11-MachineCombinerCrash.ll | 46 +- test/CodeGen/AArch64/aarch64-addv.ll | 98 + test/CodeGen/AArch64/aarch64-deferred-spilling.ll | 514 + .../AArch64/aarch64-dynamic-stack-layout.ll | 24 +- .../AArch64/aarch64-interleaved-accesses.ll | 147 +- test/CodeGen/AArch64/aarch64-loop-gep-opt.ll | 50 + test/CodeGen/AArch64/aarch64-minmaxv.ll | 511 + test/CodeGen/AArch64/aarch64-smax-constantfold.ll | 12 + test/CodeGen/AArch64/addsub_ext.ll | 146 + test/CodeGen/AArch64/alloca.ll | 4 +- .../AArch64/arm64-2011-03-17-AsmPrinterCrash.ll | 14 +- test/CodeGen/AArch64/arm64-aapcs-be.ll | 2 +- test/CodeGen/AArch64/arm64-aapcs.ll | 21 +- test/CodeGen/AArch64/arm64-abi_align.ll | 2 +- test/CodeGen/AArch64/arm64-addr-type-promotion.ll | 9 +- .../AArch64/arm64-alloca-frame-pointer-offset.ll | 6 +- test/CodeGen/AArch64/arm64-arith.ll | 3 +- test/CodeGen/AArch64/arm64-atomic-128.ll | 7 +- test/CodeGen/AArch64/arm64-atomic.ll | 70 +- test/CodeGen/AArch64/arm64-builtins-linux.ll | 11 + test/CodeGen/AArch64/arm64-ccmp-heuristics.ll | 4 +- test/CodeGen/AArch64/arm64-ccmp.ll | 166 +- test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll | 17 + test/CodeGen/AArch64/arm64-collect-loh.ll | 604 + test/CodeGen/AArch64/arm64-fast-isel-br.ll | 15 +- test/CodeGen/AArch64/arm64-fmax-safe.ll | 53 + test/CodeGen/AArch64/arm64-fmax.ll | 46 +- test/CodeGen/AArch64/arm64-fp128.ll | 31 +- test/CodeGen/AArch64/arm64-hello.ll | 4 +- test/CodeGen/AArch64/arm64-indexed-memory.ll | 33 + test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll | 26 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 2 +- test/CodeGen/AArch64/arm64-join-reserved.ll | 2 +- test/CodeGen/AArch64/arm64-large-frame.ll | 2 +- test/CodeGen/AArch64/arm64-ld-from-st.ll | 666 + test/CodeGen/AArch64/arm64-ldp.ll 
| 188 +- test/CodeGen/AArch64/arm64-long-shift.ll | 80 +- .../AArch64/arm64-misaligned-memcpy-inline.ll | 2 +- test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll | 406 + test/CodeGen/AArch64/arm64-neon-2velem.ll | 55 + test/CodeGen/AArch64/arm64-neon-copy.ll | 17 +- .../AArch64/arm64-patchpoint-webkit_jscc.ll | 8 +- test/CodeGen/AArch64/arm64-platform-reg.ll | 4 +- test/CodeGen/AArch64/arm64-popcnt.ll | 8 +- test/CodeGen/AArch64/arm64-rounding.ll | 62 +- test/CodeGen/AArch64/arm64-shrink-wrapping.ll | 95 +- test/CodeGen/AArch64/arm64-spill-lr.ll | 6 +- test/CodeGen/AArch64/arm64-stackmap.ll | 4 +- test/CodeGen/AArch64/arm64-stp.ll | 34 +- test/CodeGen/AArch64/arm64-strict-align.ll | 5 +- test/CodeGen/AArch64/arm64-tls-dynamic-together.ll | 43 +- test/CodeGen/AArch64/arm64-trunc-store.ll | 2 +- test/CodeGen/AArch64/arm64-vabs.ll | 66 + test/CodeGen/AArch64/arm64-variadic-aapcs.ll | 2 +- test/CodeGen/AArch64/arm64-vector-ext.ll | 54 +- test/CodeGen/AArch64/arm64-vminmaxnm.ll | 17 +- test/CodeGen/AArch64/arm64-xaluo.ll | 4 +- test/CodeGen/AArch64/atomic-ops.ll | 20 +- test/CodeGen/AArch64/bitcast-v2i8.ll | 2 +- test/CodeGen/AArch64/bitfield-insert.ll | 41 + test/CodeGen/AArch64/bitfield.ll | 46 +- test/CodeGen/AArch64/bitreverse.ll | 87 + test/CodeGen/AArch64/combine-comparisons-by-cse.ll | 26 + test/CodeGen/AArch64/cpus.ll | 1 + test/CodeGen/AArch64/cxx-tlscc.ll | 76 + test/CodeGen/AArch64/dag-combine-select.ll | 47 + test/CodeGen/AArch64/divrem.ll | 22 + test/CodeGen/AArch64/emutls.ll | 116 + test/CodeGen/AArch64/emutls_generic.ll | 59 + test/CodeGen/AArch64/eon.ll | 29 + test/CodeGen/AArch64/f16-instructions.ll | 111 +- test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll | 19 + .../CodeGen/AArch64/fast-isel-branch-cond-split.ll | 52 +- test/CodeGen/AArch64/fast-isel-cmp-vec.ll | 100 + test/CodeGen/AArch64/fast-isel-folded-shift.ll | 125 + test/CodeGen/AArch64/fast-isel-logic-op.ll | 2 +- test/CodeGen/AArch64/fastcc-reserved.ll | 4 +- test/CodeGen/AArch64/fastcc.ll | 8 
+- test/CodeGen/AArch64/fcvt_combine.ll | 154 + test/CodeGen/AArch64/fdiv_combine.ll | 115 + test/CodeGen/AArch64/fold-constants.ll | 19 +- test/CodeGen/AArch64/fp16-v4-instructions.ll | 51 +- test/CodeGen/AArch64/fp16-v8-instructions.ll | 63 + test/CodeGen/AArch64/free-zext.ll | 59 +- test/CodeGen/AArch64/func-argpassing.ll | 4 +- test/CodeGen/AArch64/func-calls.ll | 4 +- test/CodeGen/AArch64/global-alignment.ll | 2 +- test/CodeGen/AArch64/global-merge-1.ll | 16 +- test/CodeGen/AArch64/global-merge-2.ll | 34 +- test/CodeGen/AArch64/global-merge-3.ll | 46 +- test/CodeGen/AArch64/global-merge-4.ll | 6 +- test/CodeGen/AArch64/global-merge-group-by-use.ll | 6 +- .../global-merge-ignore-single-use-minsize.ll | 2 +- .../AArch64/global-merge-ignore-single-use.ll | 2 +- test/CodeGen/AArch64/ldst-opt.ll | 477 +- test/CodeGen/AArch64/merge-store.ll | 30 + test/CodeGen/AArch64/misched-fusion.ll | 34 + test/CodeGen/AArch64/mul-lohi.ll | 29 + test/CodeGen/AArch64/nest-register.ll | 2 +- test/CodeGen/AArch64/nontemporal.ll | 339 + test/CodeGen/AArch64/pic-eh-stubs.ll | 2 +- test/CodeGen/AArch64/readcyclecounter.ll | 15 + test/CodeGen/AArch64/regress-tblgen-chains.ll | 4 +- test/CodeGen/AArch64/remat.ll | 1 + test/CodeGen/AArch64/rotate.ll | 14 + test/CodeGen/AArch64/round-conv.ll | 330 + test/CodeGen/AArch64/shrink-wrap.ll | 184 + test/CodeGen/AArch64/stackmap-frame-setup.ll | 20 + test/CodeGen/AArch64/tail-call.ll | 6 +- test/CodeGen/AArch64/tailcall-explicit-sret.ll | 2 +- test/CodeGen/AArch64/tbi.ll | 102 + test/CodeGen/AArch64/vector-fcopysign.ll | 178 + test/CodeGen/AArch64/xbfiz.ll | 30 + test/CodeGen/AMDGPU/add.ll | 14 +- test/CodeGen/AMDGPU/address-space.ll | 6 +- test/CodeGen/AMDGPU/addrspacecast.ll | 66 + test/CodeGen/AMDGPU/and.ll | 101 +- test/CodeGen/AMDGPU/annotate-kernel-features.ll | 193 + test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 8 +- test/CodeGen/AMDGPU/bitreverse.ll | 115 + test/CodeGen/AMDGPU/calling-conventions.ll | 20 + 
test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll | 98 + test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 254 +- test/CodeGen/AMDGPU/ci-use-flat-for-global.ll | 15 + test/CodeGen/AMDGPU/ctpop64.ll | 22 +- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 12 +- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll | 52 + .../ds-negative-offset-addressing-mode-loop.ll | 10 +- test/CodeGen/AMDGPU/ds-sub-offset.ll | 125 + test/CodeGen/AMDGPU/ds_read2.ll | 10 +- test/CodeGen/AMDGPU/ds_read2_superreg.ll | 89 +- test/CodeGen/AMDGPU/ds_read2st64.ll | 8 +- test/CodeGen/AMDGPU/ds_write2.ll | 9 +- test/CodeGen/AMDGPU/ds_write2st64.ll | 4 +- test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 11 + test/CodeGen/AMDGPU/extract-vector-elt-i64.ll | 43 + test/CodeGen/AMDGPU/fadd64.ll | 50 +- test/CodeGen/AMDGPU/fceil64.ll | 12 +- test/CodeGen/AMDGPU/fcmp.ll | 2 +- test/CodeGen/AMDGPU/flat-address-space.ll | 77 +- test/CodeGen/AMDGPU/flat-scratch-reg.ll | 36 + test/CodeGen/AMDGPU/fma-combine.ll | 200 + test/CodeGen/AMDGPU/fmax_legacy.ll | 40 + test/CodeGen/AMDGPU/fmin_legacy.ll | 63 + test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll | 102 + test/CodeGen/AMDGPU/fneg-fabs.ll | 27 +- test/CodeGen/AMDGPU/ftrunc.f64.ll | 12 +- test/CodeGen/AMDGPU/gep-address-space.ll | 34 +- test/CodeGen/AMDGPU/global-constant.ll | 27 + test/CodeGen/AMDGPU/global-extload-i32.ll | 327 +- test/CodeGen/AMDGPU/global_atomics.ll | 20 +- test/CodeGen/AMDGPU/half.ll | 256 +- test/CodeGen/AMDGPU/hsa-globals.ll | 132 + test/CodeGen/AMDGPU/hsa-group-segment.ll | 14 + test/CodeGen/AMDGPU/hsa.ll | 36 +- test/CodeGen/AMDGPU/image-attributes.ll | 206 + test/CodeGen/AMDGPU/image-resource-id.ll | 409 + test/CodeGen/AMDGPU/imm.ll | 24 +- test/CodeGen/AMDGPU/indirect-addressing-si.ll | 67 +- test/CodeGen/AMDGPU/indirect-private-64.ll | 34 +- test/CodeGen/AMDGPU/inline-constraints.ll | 23 + test/CodeGen/AMDGPU/insert_vector_elt.ll | 103 +- test/CodeGen/AMDGPU/kernel-args.ll | 26 +- test/CodeGen/AMDGPU/large-alloca-compute.ll | 57 + 
test/CodeGen/AMDGPU/large-alloca-graphics.ll | 47 + test/CodeGen/AMDGPU/large-alloca.ll | 15 - test/CodeGen/AMDGPU/literals.ll | 8 +- test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll | 4 +- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll | 2 +- test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll | 28 - test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll | 6 +- test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll | 1 - test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll | 37 + test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll | 2 +- test/CodeGen/AMDGPU/llvm.SI.packf16.ll | 29 + test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll | 16 + .../AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll | 14 + .../AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll | 16 + test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll | 16 + test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll | 30 + test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll | 24 + test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll | 29 + .../CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll | 29 + test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll | 27 + test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll | 27 + test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll | 2 +- test/CodeGen/AMDGPU/llvm.dbg.value.ll | 12 +- test/CodeGen/AMDGPU/llvm.memcpy.ll | 66 +- test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll | 184 + test/CodeGen/AMDGPU/llvm.round.f64.ll | 5 +- test/CodeGen/AMDGPU/load.ll | 34 +- test/CodeGen/AMDGPU/local-memory-two-objects.ll | 4 +- test/CodeGen/AMDGPU/local-memory.ll | 4 +- test/CodeGen/AMDGPU/max.ll | 116 +- test/CodeGen/AMDGPU/merge-stores.ll | 196 +- test/CodeGen/AMDGPU/min.ll | 171 +- .../AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll | 36 + test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 52 + test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll | 18 + test/CodeGen/AMDGPU/no-shrink-extloads.ll | 12 + test/CodeGen/AMDGPU/opencl-image-metadata.ll | 24 + test/CodeGen/AMDGPU/operand-folding.ll | 2 +- test/CodeGen/AMDGPU/or.ll | 2 +- .../partially-dead-super-register-immediate.ll | 28 + test/CodeGen/AMDGPU/private-memory.ll | 12 + 
test/CodeGen/AMDGPU/register-count-comments.ll | 3 +- test/CodeGen/AMDGPU/reorder-stores.ll | 58 +- test/CodeGen/AMDGPU/s_movk_i32.ll | 18 +- test/CodeGen/AMDGPU/salu-to-valu.ll | 418 +- test/CodeGen/AMDGPU/sampler-resource-id.ll | 65 + .../AMDGPU/schedule-vs-if-nested-loop-failure.ll | 2 +- test/CodeGen/AMDGPU/scratch-buffer.ll | 2 +- test/CodeGen/AMDGPU/select64.ll | 8 +- test/CodeGen/AMDGPU/set-dx10.ll | 48 +- test/CodeGen/AMDGPU/setcc-opt.ll | 22 +- test/CodeGen/AMDGPU/sext-in-reg.ll | 54 +- test/CodeGen/AMDGPU/shl.ll | 15 +- test/CodeGen/AMDGPU/shl_add_constant.ll | 6 +- test/CodeGen/AMDGPU/shl_add_ptr.ll | 2 +- .../si-instr-info-correct-implicit-operands.ll | 16 + test/CodeGen/AMDGPU/si-literal-folding.ll | 17 + test/CodeGen/AMDGPU/si-sgpr-spill.ll | 10 + test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll | 7 +- test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 6 +- test/CodeGen/AMDGPU/sminmax.ll | 130 + test/CodeGen/AMDGPU/smrd.ll | 73 +- test/CodeGen/AMDGPU/split-scalar-i64-add.ll | 42 +- .../AMDGPU/split-vector-memoperand-offsets.ll | 104 + test/CodeGen/AMDGPU/sra.ll | 8 +- test/CodeGen/AMDGPU/srl.ll | 13 +- test/CodeGen/AMDGPU/store-barrier.ll | 4 +- test/CodeGen/AMDGPU/store.ll | 25 +- test/CodeGen/AMDGPU/store_typed.ll | 24 + test/CodeGen/AMDGPU/sub.ll | 14 +- test/CodeGen/AMDGPU/trunc.ll | 8 +- test/CodeGen/AMDGPU/udivrem.ll | 130 +- test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 6 +- test/CodeGen/AMDGPU/unsupported-cc.ll | 32 +- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll | 167 + test/CodeGen/AMDGPU/valu-i1.ll | 16 +- .../vgpr-spill-emergency-stack-slot-compute.ll | 585 + .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 494 + test/CodeGen/AMDGPU/vop-shrink.ll | 4 +- test/CodeGen/AMDGPU/wait.ll | 61 +- test/CodeGen/AMDGPU/work-item-intrinsics.ll | 263 +- test/CodeGen/AMDGPU/xor.ll | 2 +- test/CodeGen/AMDGPU/zero_extend.ll | 3 +- test/CodeGen/ARM/2007-03-13-InstrSched.ll | 2 +- test/CodeGen/ARM/2009-10-16-Scope.ll | 6 +- 
test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll | 6 +- test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll | 24 +- test/CodeGen/ARM/2010-05-21-BuildVector.ll | 4 +- test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll | 4 +- test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll | 2 +- .../ARM/2010-06-25-Thumb2ITInvalidIterator.ll | 12 +- .../ARM/2010-06-29-PartialRedefFastAlloc.ll | 4 +- test/CodeGen/ARM/2010-08-04-StackVariable.ll | 24 +- test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll | 42 +- test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll | 2 +- test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll | 42 +- test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll | 4 +- test/CodeGen/ARM/2011-10-26-memset-inline.ll | 2 +- .../CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll | 10 +- test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll | 4 +- test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll | 14 +- test/CodeGen/ARM/2012-11-14-subs_carry.ll | 10 +- test/CodeGen/ARM/2013-10-11-select-stalls.ll | 13 +- .../ARM/2014-01-09-pseudo_expand_implicit_reg.ll | 4 +- test/CodeGen/ARM/MachO-subtypes.ll | 68 + test/CodeGen/ARM/Windows/division.ll | 38 + .../Windows/integer-floating-point-conversion.ll | 74 - test/CodeGen/ARM/Windows/libcalls.ll | 75 + test/CodeGen/ARM/Windows/no-eabi.ll | 10 + test/CodeGen/ARM/Windows/no-frame-register.ll | 22 + test/CodeGen/ARM/Windows/overflow.ll | 77 + test/CodeGen/ARM/adv-copy-opt.ll | 14 +- test/CodeGen/ARM/aliases.ll | 30 +- test/CodeGen/ARM/align-sp-adjustment.ll | 47 + test/CodeGen/ARM/apcs-vfp.ll | 153 + test/CodeGen/ARM/arm-eabi.ll | 63 + test/CodeGen/ARM/arm-interleaved-accesses.ll | 190 +- test/CodeGen/ARM/arm-shrink-wrapping-linux.ll | 142 + test/CodeGen/ARM/arm-shrink-wrapping.ll | 683 + test/CodeGen/ARM/atomic-64bit.ll | 6 + test/CodeGen/ARM/atomic-cmp.ll | 4 +- test/CodeGen/ARM/atomic-cmpxchg.ll | 98 +- test/CodeGen/ARM/atomic-op.ll | 46 +- test/CodeGen/ARM/atomic-ops-v8.ll | 38 +- test/CodeGen/ARM/avoid-cpsr-rmw.ll | 16 +- test/CodeGen/ARM/bfi.ll | 95 + 
.../ARM/build-attributes-optimization-minsize.ll | 18 + .../ARM/build-attributes-optimization-mixed.ll | 23 + .../ARM/build-attributes-optimization-optnone.ll | 18 + .../ARM/build-attributes-optimization-optsize.ll | 18 + test/CodeGen/ARM/build-attributes-optimization.ll | 23 + test/CodeGen/ARM/build-attributes.ll | 142 +- test/CodeGen/ARM/call-tc.ll | 8 +- test/CodeGen/ARM/cfi-alignment.ll | 48 + test/CodeGen/ARM/cmpxchg-idioms.ll | 6 +- test/CodeGen/ARM/cmpxchg-weak.ll | 56 +- test/CodeGen/ARM/coalesce-dbgvalue.ll | 10 +- test/CodeGen/ARM/coalesce-subregs.ll | 38 +- test/CodeGen/ARM/combine-vmovdrr.ll | 72 + test/CodeGen/ARM/constants.ll | 6 +- test/CodeGen/ARM/dagcombine-concatvector.ll | 4 +- test/CodeGen/ARM/debug-frame-vararg.ll | 14 +- test/CodeGen/ARM/debug-frame.ll | 28 +- test/CodeGen/ARM/debug-info-arg.ll | 20 +- test/CodeGen/ARM/debug-info-blocks.ll | 40 +- test/CodeGen/ARM/debug-info-branch-folding.ll | 32 +- test/CodeGen/ARM/debug-info-d16-reg.ll | 38 +- test/CodeGen/ARM/debug-info-no-frame.ll | 8 +- test/CodeGen/ARM/debug-info-qreg.ll | 28 +- test/CodeGen/ARM/debug-info-s16-reg.ll | 38 +- test/CodeGen/ARM/debug-info-sreg2.ll | 10 +- test/CodeGen/ARM/debug-segmented-stacks.ll | 14 +- test/CodeGen/ARM/debugtrap.ll | 17 + test/CodeGen/ARM/div.ll | 71 +- test/CodeGen/ARM/divmod-eabi.ll | 4 +- test/CodeGen/ARM/eh-resume-darwin.ll | 8 +- test/CodeGen/ARM/emutls.ll | 258 + test/CodeGen/ARM/emutls1.ll | 31 + test/CodeGen/ARM/emutls_generic.ll | 61 + test/CodeGen/ARM/fast-isel-align.ll | 24 +- test/CodeGen/ARM/fast-isel-ext.ll | 35 - test/CodeGen/ARM/fast-isel-mvn.ll | 10 +- test/CodeGen/ARM/fast-isel-pic.ll | 23 +- test/CodeGen/ARM/fold-stack-adjust.ll | 18 +- test/CodeGen/ARM/fp16-args.ll | 40 + test/CodeGen/ARM/fp16-promote.ll | 471 +- test/CodeGen/ARM/fp16.ll | 62 +- test/CodeGen/ARM/fparith.ll | 4 +- test/CodeGen/ARM/gep-optimization.ll | 77 + test/CodeGen/ARM/global-merge-1.ll | 6 +- test/CodeGen/ARM/global-merge-external.ll | 46 + 
test/CodeGen/ARM/globals.ll | 9 +- test/CodeGen/ARM/ifcvt-branch-weight-bug.ll | 14 +- test/CodeGen/ARM/ifcvt-branch-weight.ll | 2 +- test/CodeGen/ARM/ifcvt-iter-indbr.ll | 6 + test/CodeGen/ARM/ifcvt4.ll | 6 +- test/CodeGen/ARM/ifcvt5.ll | 4 +- test/CodeGen/ARM/ifcvt6.ll | 2 +- test/CodeGen/ARM/ifcvt8.ll | 4 +- test/CodeGen/ARM/inlineasm-switch-mode.ll | 4 +- test/CodeGen/ARM/ldm-stm-base-materialization.ll | 93 + test/CodeGen/ARM/ldrd.ll | 58 +- test/CodeGen/ARM/legalize-unaligned-load.ll | 35 + test/CodeGen/ARM/load-global.ll | 12 +- test/CodeGen/ARM/load-store-flags.ll | 4 +- test/CodeGen/ARM/load.ll | 571 +- test/CodeGen/ARM/machine-cse-cmp.ll | 2 +- test/CodeGen/ARM/memcpy-inline.ll | 2 +- test/CodeGen/ARM/memcpy-ldm-stm.ll | 94 + test/CodeGen/ARM/memfunc.ll | 255 +- test/CodeGen/ARM/minmax.ll | 193 + test/CodeGen/ARM/neon_minmax.ll | 1 + test/CodeGen/ARM/neon_spill.ll | 6 +- test/CodeGen/ARM/neon_vabs.ll | 38 + test/CodeGen/ARM/neon_vshl_minint.ll | 13 + test/CodeGen/ARM/out-of-registers.ll | 8 +- test/CodeGen/ARM/pr25317.ll | 11 + test/CodeGen/ARM/pr25838.ll | 34 + test/CodeGen/ARM/rbit.ll | 11 + test/CodeGen/ARM/reg_sequence.ll | 64 +- test/CodeGen/ARM/rotate.ll | 14 + test/CodeGen/ARM/sat-arith.ll | 63 + test/CodeGen/ARM/sched-it-debug-nodes.ll | 88 - test/CodeGen/ARM/setjmp_longjmp.ll | 113 + test/CodeGen/ARM/shifter_operand.ll | 228 +- test/CodeGen/ARM/sjlj-prepare-critical-edge.ll | 2 +- .../ARM/sjljehprepare-lower-empty-struct.ll | 1 + test/CodeGen/ARM/softfp-fabs-fneg.ll | 41 + test/CodeGen/ARM/special-reg-mcore.ll | 2 +- test/CodeGen/ARM/spill-q.ll | 28 +- test/CodeGen/ARM/ssat-lower.ll | 11 + test/CodeGen/ARM/ssat-upper.ll | 11 + test/CodeGen/ARM/subtarget-no-movt.ll | 45 + test/CodeGen/ARM/tail-merge-branch-weight.ll | 2 +- test/CodeGen/ARM/taildup-branch-weight.ll | 4 +- test/CodeGen/ARM/test-sharedidx.ll | 15 +- test/CodeGen/ARM/thumb-alignment.ll | 2 +- test/CodeGen/ARM/thumb1-ldst-opt.ll | 27 + test/CodeGen/ARM/thumb1_return_sequence.ll | 70 +- 
test/CodeGen/ARM/thumb2-it-block.ll | 24 +- test/CodeGen/ARM/thumb_indirect_calls.ll | 5 +- test/CodeGen/ARM/tls-models.ll | 74 +- test/CodeGen/ARM/tls3.ll | 29 +- test/CodeGen/ARM/unaligned_load_store.ll | 4 +- test/CodeGen/ARM/unaligned_load_store_vfp.ll | 98 + test/CodeGen/ARM/usat-lower.ll | 11 + test/CodeGen/ARM/usat-upper.ll | 11 + test/CodeGen/ARM/v7k-abi-align.ll | 152 + test/CodeGen/ARM/v7k-libcalls.ll | 154 + test/CodeGen/ARM/v7k-sincos.ll | 16 + test/CodeGen/ARM/vcge.ll | 4 +- test/CodeGen/ARM/vcombine.ll | 64 +- test/CodeGen/ARM/vcvt_combine.ll | 103 +- test/CodeGen/ARM/vdiv_combine.ll | 17 + test/CodeGen/ARM/vdup.ll | 16 + test/CodeGen/ARM/vector-DAGCombine.ll | 4 +- test/CodeGen/ARM/vector-load.ll | 4 +- test/CodeGen/ARM/vector-store.ll | 6 +- test/CodeGen/ARM/vext.ll | 34 +- test/CodeGen/ARM/vfp-reg-stride.ll | 42 + test/CodeGen/ARM/vfp-regs-dwarf.ll | 6 +- test/CodeGen/ARM/vld-vst-upgrade.ll | 139 + test/CodeGen/ARM/vld1.ll | 52 +- test/CodeGen/ARM/vld2.ll | 40 +- test/CodeGen/ARM/vld3.ll | 42 +- test/CodeGen/ARM/vld4.ll | 42 +- test/CodeGen/ARM/vlddup.ll | 30 +- test/CodeGen/ARM/vldlane.ll | 92 +- test/CodeGen/ARM/vminmaxnm-safe.ll | 396 + test/CodeGen/ARM/vminmaxnm.ll | 358 +- test/CodeGen/ARM/vmov.ll | 4 +- test/CodeGen/ARM/vmul.ll | 14 +- test/CodeGen/ARM/vpadd.ll | 2 +- test/CodeGen/ARM/vselect_imax.ll | 26 +- test/CodeGen/ARM/vst1.ll | 48 +- test/CodeGen/ARM/vst2.ll | 44 +- test/CodeGen/ARM/vst3.ll | 42 +- test/CodeGen/ARM/vst4.ll | 42 +- test/CodeGen/ARM/vstlane.ll | 90 +- test/CodeGen/ARM/vtrn.ll | 124 +- test/CodeGen/ARM/vuzp.ll | 136 +- test/CodeGen/ARM/vzip.ll | 82 +- test/CodeGen/BPF/sockex2.ll | 2 +- test/CodeGen/CPP/gep.ll | 10 + test/CodeGen/Generic/2009-03-17-LSR-APInt.ll | 28 +- test/CodeGen/Generic/ForceStackAlign.ll | 27 + test/CodeGen/Generic/MachineBranchProb.ll | 8 +- test/CodeGen/Generic/dbg_value.ll | 5 +- test/CodeGen/Generic/lit.local.cfg | 3 + test/CodeGen/Generic/overloaded-intrinsic-name.ll | 32 +- 
test/CodeGen/Generic/vector.ll | 6 + test/CodeGen/Hexagon/NVJumpCmp.ll | 89 + test/CodeGen/Hexagon/absaddr-store.ll | 1 + test/CodeGen/Hexagon/adde.ll | 6 +- test/CodeGen/Hexagon/alu64.ll | 134 +- test/CodeGen/Hexagon/bit-eval.ll | 53 + test/CodeGen/Hexagon/bit-loop.ll | 80 + test/CodeGen/Hexagon/cfi-late.ll | 65 + test/CodeGen/Hexagon/clr_set_toggle.ll | 2 +- test/CodeGen/Hexagon/combine.ll | 2 +- test/CodeGen/Hexagon/combine_ir.ll | 16 +- test/CodeGen/Hexagon/early-if-conversion-bug1.ll | 412 + test/CodeGen/Hexagon/early-if-phi-i1.ll | 17 + test/CodeGen/Hexagon/early-if-spare.ll | 57 + test/CodeGen/Hexagon/early-if.ll | 75 + test/CodeGen/Hexagon/extload-combine.ll | 2 +- test/CodeGen/Hexagon/hwloop-dbg.ll | 12 +- test/CodeGen/Hexagon/i16_VarArg.ll | 2 +- test/CodeGen/Hexagon/i1_VarArg.ll | 2 +- test/CodeGen/Hexagon/i8_VarArg.ll | 2 +- test/CodeGen/Hexagon/ifcvt-edge-weight.ll | 64 + test/CodeGen/Hexagon/memcpy-likely-aligned.ll | 32 + test/CodeGen/Hexagon/mux-basic.ll | 28 + test/CodeGen/Hexagon/opt-fabs.ll | 2 +- test/CodeGen/Hexagon/pic-jumptables.ll | 48 + test/CodeGen/Hexagon/pic-simple.ll | 22 + test/CodeGen/Hexagon/pic-static.ll | 21 + test/CodeGen/Hexagon/relax.ll | 9 +- test/CodeGen/Hexagon/sdr-basic.ll | 15 + test/CodeGen/Hexagon/sdr-shr32.ll | 22 + test/CodeGen/Hexagon/simple_addend.ll | 2 +- test/CodeGen/Hexagon/store-widen-aliased-load.ll | 21 + test/CodeGen/Hexagon/store-widen-negv.ll | 11 + test/CodeGen/Hexagon/store-widen-negv2.ll | 19 + test/CodeGen/Hexagon/store-widen.ll | 18 + test/CodeGen/Hexagon/struct_args.ll | 2 +- test/CodeGen/Hexagon/sube.ll | 8 +- test/CodeGen/Hexagon/tail-dup-subreg-abort.ll | 28 + test/CodeGen/Hexagon/tfr-to-combine.ll | 2 +- test/CodeGen/Hexagon/union-1.ll | 2 - test/CodeGen/Hexagon/v60Intrins.ll | 2559 +++ test/CodeGen/Hexagon/v60Vasr.ll | 247 + test/CodeGen/Hexagon/v60small.ll | 51 + test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll | 2 +- test/CodeGen/Hexagon/vect/vect-loadv4i16.ll | 2 +- 
test/CodeGen/Hexagon/vect/vect-shuffle.ll | 2 +- test/CodeGen/Hexagon/vect/vect-splat.ll | 2 +- test/CodeGen/Hexagon/vect/vect-xor.ll | 2 +- test/CodeGen/Inputs/DbgValueOtherTargets.ll | 8 +- test/CodeGen/MIR/AArch64/cfi-def-cfa.mir | 31 + .../MIR/AArch64/expected-target-flag-name.mir | 23 + .../MIR/AArch64/invalid-target-flag-name.mir | 23 + test/CodeGen/MIR/AArch64/lit.local.cfg | 8 + test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir | 28 + .../MIR/AArch64/stack-object-local-offset.mir | 41 + test/CodeGen/MIR/AArch64/target-flags.mir | 39 + .../MIR/AMDGPU/expected-target-index-name.mir | 64 + .../MIR/AMDGPU/invalid-target-index-operand.mir | 64 + test/CodeGen/MIR/AMDGPU/lit.local.cfg | 2 + test/CodeGen/MIR/AMDGPU/target-index-operands.mir | 104 + test/CodeGen/MIR/ARM/ARMLoadStoreDBG.mir | 165 + test/CodeGen/MIR/ARM/bundled-instructions.mir | 75 + test/CodeGen/MIR/ARM/cfi-same-value.mir | 80 + test/CodeGen/MIR/ARM/expected-closing-brace.mir | 50 + .../MIR/ARM/extraneous-closing-brace-error.mir | 20 + test/CodeGen/MIR/ARM/lit.local.cfg | 2 + .../MIR/ARM/nested-instruction-bundle-error.mir | 30 + test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir | 160 + test/CodeGen/MIR/Generic/basic-blocks.mir | 49 + .../Generic/expected-colon-after-basic-block.mir | 16 + .../expected-mbb-reference-for-successor-mbb.mir | 28 + test/CodeGen/MIR/Generic/frame-info.mir | 89 + .../Generic/function-missing-machine-function.mir | 13 + .../MIR/Generic/invalid-jump-table-kind.mir | 53 + test/CodeGen/MIR/Generic/lit.local.cfg | 3 + .../CodeGen/MIR/Generic/llvm-ir-error-reported.mir | 22 + test/CodeGen/MIR/Generic/llvmIR.mir | 37 + test/CodeGen/MIR/Generic/llvmIRMissing.mir | 9 + .../machine-basic-block-ir-block-reference.mir | 17 + .../machine-basic-block-redefinition-error.mir | 18 + .../machine-basic-block-undefined-ir-block.mir | 15 + .../Generic/machine-basic-block-unknown-name.mir | 18 + .../machine-function-missing-body-error.mir | 15 + .../Generic/machine-function-missing-function.mir | 
23 + .../MIR/Generic/machine-function-missing-name.mir | 26 + .../machine-function-redefinition-error.mir | 10 + test/CodeGen/MIR/Generic/machine-function.mir | 66 + test/CodeGen/MIR/Generic/register-info.mir | 40 + ...ted-global-value-or-symbol-after-call-entry.mir | 41 + test/CodeGen/MIR/Mips/lit.local.cfg | 2 + test/CodeGen/MIR/Mips/memory-operands.mir | 102 + .../MIR/NVPTX/expected-floating-point-literal.mir | 24 + .../NVPTX/floating-point-immediate-operands.mir | 81 + .../NVPTX/floating-point-invalid-type-error.mir | 24 + test/CodeGen/MIR/NVPTX/lit.local.cfg | 2 + test/CodeGen/MIR/PowerPC/lit.local.cfg | 2 + .../MIR/PowerPC/unordered-implicit-registers.mir | 45 + test/CodeGen/MIR/X86/basic-block-liveins.mir | 57 +- .../X86/basic-block-not-at-start-of-line-error.mir | 41 + test/CodeGen/MIR/X86/block-address-operands.mir | 121 + test/CodeGen/MIR/X86/callee-saved-info.mir | 95 + test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir | 29 + test/CodeGen/MIR/X86/cfi-def-cfa-register.mir | 32 + test/CodeGen/MIR/X86/cfi-offset.mir | 47 + .../X86/constant-pool-item-redefinition-error.mir | 25 + test/CodeGen/MIR/X86/constant-pool.mir | 139 + test/CodeGen/MIR/X86/constant-value-error.mir | 25 + test/CodeGen/MIR/X86/dead-register-flag.mir | 14 +- .../MIR/X86/def-register-already-tied-error.mir | 25 + .../MIR/X86/duplicate-memory-operand-flag.mir | 27 + .../MIR/X86/duplicate-register-flag-error.mir | 35 + .../MIR/X86/early-clobber-register-flag.mir | 45 + .../MIR/X86/expected-align-in-memory-operand.mir | 30 + ...ted-alignment-after-align-in-memory-operand.mir | 30 + .../X86/expected-basic-block-at-start-of-body.mir | 40 + .../expected-block-reference-in-blockaddress.mir | 30 + .../MIR/X86/expected-comma-after-cfi-register.mir | 42 + .../X86/expected-comma-after-memory-operand.mir | 25 + .../X86/expected-different-implicit-operand.mir | 28 +- .../expected-different-implicit-register-flag.mir | 28 +- .../MIR/X86/expected-from-in-memory-operand.mir | 24 + 
...ected-function-reference-after-blockaddress.mir | 30 + .../expected-global-value-after-blockaddress.mir | 30 + .../MIR/X86/expected-integer-after-offset-sign.mir | 24 + .../MIR/X86/expected-integer-after-tied-def.mir | 25 + .../X86/expected-integer-in-successor-weight.mir | 38 + .../expected-load-or-store-in-memory-operand.mir | 23 + test/CodeGen/MIR/X86/expected-machine-operand.mir | 12 +- ...expected-metadata-node-after-debug-location.mir | 59 + .../X86/expected-metadata-node-after-exclaim.mir | 59 + .../X86/expected-metadata-node-in-stack-object.mir | 25 + .../expected-named-register-in-allocation-hint.mir | 29 + ...ted-named-register-in-callee-saved-register.mir | 88 + ...expected-named-register-in-functions-livein.mir | 27 + .../MIR/X86/expected-named-register-livein.mir | 15 +- .../MIR/X86/expected-newline-at-end-of-list.mir | 41 + test/CodeGen/MIR/X86/expected-number-after-bb.mir | 28 +- .../MIR/X86/expected-offset-after-cfi-operand.mir | 27 + .../expected-pointer-value-in-memory-operand.mir | 24 + .../expected-positive-alignment-after-align.mir | 30 + .../X86/expected-register-after-cfi-operand.mir | 42 + .../MIR/X86/expected-register-after-flags.mir | 12 +- ...xpected-size-integer-after-memory-operation.mir | 24 + test/CodeGen/MIR/X86/expected-stack-object.mir | 67 + .../MIR/X86/expected-subregister-after-colon.mir | 18 +- test/CodeGen/MIR/X86/expected-target-flag-name.mir | 24 + .../MIR/X86/expected-tied-def-after-lparen.mir | 25 + .../MIR/X86/expected-value-in-memory-operand.mir | 24 + ...pected-virtual-register-in-functions-livein.mir | 27 + test/CodeGen/MIR/X86/external-symbol-operands.mir | 64 + .../MIR/X86/fixed-stack-memory-operands.mir | 39 + .../X86/fixed-stack-object-redefinition-error.mir | 28 + test/CodeGen/MIR/X86/fixed-stack-objects.mir | 12 +- .../MIR/X86/frame-info-save-restore-points.mir | 73 + .../MIR/X86/frame-info-stack-references.mir | 79 + .../MIR/X86/frame-setup-instruction-flag.mir | 35 + test/CodeGen/MIR/X86/function-liveins.mir 
| 37 + test/CodeGen/MIR/X86/global-value-operands.mir | 127 +- test/CodeGen/MIR/X86/immediate-operands.mir | 28 +- test/CodeGen/MIR/X86/implicit-register-flag.mir | 65 +- test/CodeGen/MIR/X86/inline-asm-registers.mir | 54 + .../MIR/X86/instructions-debug-location.mir | 98 + .../CodeGen/MIR/X86/invalid-constant-pool-item.mir | 25 + .../CodeGen/MIR/X86/invalid-metadata-node-type.mir | 53 + test/CodeGen/MIR/X86/invalid-target-flag-name.mir | 24 + .../MIR/X86/invalid-tied-def-index-error.mir | 25 + test/CodeGen/MIR/X86/jump-table-info.mir | 150 + .../MIR/X86/jump-table-redefinition-error.mir | 76 + test/CodeGen/MIR/X86/killed-register-flag.mir | 38 +- .../MIR/X86/large-cfi-offset-number-error.mir | 27 + .../MIR/X86/large-immediate-operand-error.mir | 18 + test/CodeGen/MIR/X86/large-index-number-error.mir | 26 +- test/CodeGen/MIR/X86/large-offset-number-error.mir | 24 + .../MIR/X86/large-size-in-memory-operand-error.mir | 24 + test/CodeGen/MIR/X86/liveout-register-mask.mir | 42 + .../MIR/X86/machine-basic-block-operands.mir | 68 +- test/CodeGen/MIR/X86/machine-instructions.mir | 14 +- test/CodeGen/MIR/X86/machine-verifier.mir | 22 + test/CodeGen/MIR/X86/memory-operands.mir | 508 + test/CodeGen/MIR/X86/metadata-operands.mir | 63 + test/CodeGen/MIR/X86/missing-closing-quote.mir | 22 + test/CodeGen/MIR/X86/missing-comma.mir | 12 +- test/CodeGen/MIR/X86/missing-implicit-operand.mir | 30 +- test/CodeGen/MIR/X86/missing-instruction.mir | 19 - test/CodeGen/MIR/X86/named-registers.mir | 14 +- test/CodeGen/MIR/X86/newline-handling.mir | 109 + test/CodeGen/MIR/X86/null-register-operands.mir | 14 +- test/CodeGen/MIR/X86/register-mask-operands.mir | 28 +- .../X86/register-operands-target-flag-error.mir | 24 + .../MIR/X86/simple-register-allocation-hints.mir | 34 + .../X86/spill-slot-fixed-stack-object-aliased.mir | 12 +- .../spill-slot-fixed-stack-object-immutable.mir | 12 +- .../MIR/X86/spill-slot-fixed-stack-objects.mir | 12 +- test/CodeGen/MIR/X86/stack-object-debug-info.mir | 
65 + test/CodeGen/MIR/X86/stack-object-invalid-name.mir | 28 + .../stack-object-operand-name-mismatch-error.mir | 33 + test/CodeGen/MIR/X86/stack-object-operands.mir | 45 + .../MIR/X86/stack-object-redefinition-error.mir | 37 + test/CodeGen/MIR/X86/stack-objects.mir | 22 +- test/CodeGen/MIR/X86/standalone-register-error.mir | 24 + test/CodeGen/MIR/X86/subregister-operands.mir | 21 +- .../MIR/X86/successor-basic-blocks-weights.mir | 42 + test/CodeGen/MIR/X86/successor-basic-blocks.mir | 83 + test/CodeGen/MIR/X86/tied-def-operand-invalid.mir | 25 + test/CodeGen/MIR/X86/undef-register-flag.mir | 26 +- .../MIR/X86/undefined-fixed-stack-object.mir | 38 + test/CodeGen/MIR/X86/undefined-global-value.mir | 16 +- .../MIR/X86/undefined-ir-block-in-blockaddress.mir | 30 + .../undefined-ir-block-slot-in-blockaddress.mir | 29 + test/CodeGen/MIR/X86/undefined-jump-table-id.mir | 73 + .../MIR/X86/undefined-named-global-value.mir | 16 +- test/CodeGen/MIR/X86/undefined-register-class.mir | 8 +- test/CodeGen/MIR/X86/undefined-stack-object.mir | 30 + .../MIR/X86/undefined-value-in-memory-operand.mir | 24 + .../CodeGen/MIR/X86/undefined-virtual-register.mir | 14 +- test/CodeGen/MIR/X86/unknown-instruction.mir | 10 +- .../MIR/X86/unknown-machine-basic-block.mir | 26 +- test/CodeGen/MIR/X86/unknown-metadata-keyword.mir | 25 + test/CodeGen/MIR/X86/unknown-metadata-node.mir | 59 + .../MIR/X86/unknown-named-machine-basic-block.mir | 28 +- test/CodeGen/MIR/X86/unknown-register.mir | 12 +- test/CodeGen/MIR/X86/unknown-subregister-index.mir | 18 +- test/CodeGen/MIR/X86/unrecognized-character.mir | 10 +- .../MIR/X86/used-physical-register-info.mir | 109 + .../X86/variable-sized-stack-object-size-error.mir | 14 +- .../MIR/X86/variable-sized-stack-objects.mir | 18 +- .../X86/virtual-register-redefinition-error.mir | 27 + test/CodeGen/MIR/X86/virtual-registers.mir | 90 +- test/CodeGen/MIR/basic-blocks.mir | 49 - .../MIR/expected-eof-after-successor-mbb.mir | 29 - 
.../expected-mbb-reference-for-successor-mbb.mir | 29 - test/CodeGen/MIR/frame-info.mir | 91 - .../MIR/function-missing-machine-function.mir | 13 - test/CodeGen/MIR/llvm-ir-error-reported.mir | 22 - test/CodeGen/MIR/llvmIR.mir | 37 - test/CodeGen/MIR/llvmIRMissing.mir | 9 - .../MIR/machine-basic-block-redefinition-error.mir | 17 - .../MIR/machine-basic-block-unknown-name.mir | 19 - .../MIR/machine-function-missing-body-error.mir | 15 - .../MIR/machine-function-missing-function.mir | 23 - test/CodeGen/MIR/machine-function-missing-name.mir | 26 - .../MIR/machine-function-redefinition-error.mir | 10 - test/CodeGen/MIR/machine-function.mir | 66 - test/CodeGen/MIR/register-info.mir | 40 - test/CodeGen/MIR/successor-basic-blocks.mir | 58 - .../CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll | 27 + test/CodeGen/Mips/addi.ll | 2 +- test/CodeGen/Mips/adjust-callstack-sp.ll | 2 +- test/CodeGen/Mips/align16.ll | 2 +- test/CodeGen/Mips/alloca16.ll | 2 +- test/CodeGen/Mips/and1.ll | 2 +- test/CodeGen/Mips/asm-large-immediate.ll | 3 +- test/CodeGen/Mips/atomicops.ll | 2 +- test/CodeGen/Mips/beqzc.ll | 2 +- test/CodeGen/Mips/beqzc1.ll | 2 +- test/CodeGen/Mips/br-jmp.ll | 4 +- test/CodeGen/Mips/brconeq.ll | 2 +- test/CodeGen/Mips/brconeqk.ll | 2 +- test/CodeGen/Mips/brconeqz.ll | 2 +- test/CodeGen/Mips/brconge.ll | 2 +- test/CodeGen/Mips/brcongt.ll | 2 +- test/CodeGen/Mips/brconle.ll | 2 +- test/CodeGen/Mips/brconlt.ll | 2 +- test/CodeGen/Mips/brconne.ll | 2 +- test/CodeGen/Mips/brconnek.ll | 2 +- test/CodeGen/Mips/brconnez.ll | 2 +- test/CodeGen/Mips/brind.ll | 2 +- test/CodeGen/Mips/brsize3.ll | 4 +- test/CodeGen/Mips/brsize3a.ll | 2 +- test/CodeGen/Mips/cconv/arguments-varargs.ll | 72 +- test/CodeGen/Mips/ci2.ll | 2 +- test/CodeGen/Mips/cmplarge.ll | 2 +- test/CodeGen/Mips/const1.ll | 2 +- test/CodeGen/Mips/const4a.ll | 2 +- test/CodeGen/Mips/const6.ll | 4 +- test/CodeGen/Mips/const6a.ll | 4 +- test/CodeGen/Mips/div.ll | 2 +- test/CodeGen/Mips/div_rem.ll | 2 +- 
test/CodeGen/Mips/divu.ll | 2 +- test/CodeGen/Mips/divu_remu.ll | 2 +- test/CodeGen/Mips/eh.ll | 2 +- test/CodeGen/Mips/emergency-spill-slot-near-fp.ll | 4 +- test/CodeGen/Mips/emutls_generic.ll | 70 + test/CodeGen/Mips/ex2.ll | 2 +- test/CodeGen/Mips/extins.ll | 2 +- test/CodeGen/Mips/f16abs.ll | 2 +- test/CodeGen/Mips/fixdfsf.ll | 4 +- test/CodeGen/Mips/fp16instrinsmc.ll | 4 +- test/CodeGen/Mips/fp16mix.ll | 6 +- test/CodeGen/Mips/fp16static.ll | 2 +- test/CodeGen/Mips/helloworld.ll | 12 +- test/CodeGen/Mips/hf16_1.ll | 4 +- test/CodeGen/Mips/hf16call32.ll | 408 +- test/CodeGen/Mips/hf16call32_body.ll | 206 +- test/CodeGen/Mips/hf1_body.ll | 18 +- test/CodeGen/Mips/hfptrcall.ll | 2 +- test/CodeGen/Mips/i32k.ll | 2 +- .../CodeGen/Mips/inlineasm-assembler-directives.ll | 4 +- test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll | 36 +- test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll | 4 +- test/CodeGen/Mips/inlineasm-operand-code.ll | 185 +- test/CodeGen/Mips/inlineasm_constraint.ll | 94 +- test/CodeGen/Mips/inlineasmmemop.ll | 8 +- test/CodeGen/Mips/insn-zero-size-bb.ll | 4 +- test/CodeGen/Mips/interrupt-attr-64-error.ll | 9 + test/CodeGen/Mips/interrupt-attr-args-error.ll | 9 + test/CodeGen/Mips/interrupt-attr-error.ll | 9 + test/CodeGen/Mips/interrupt-attr.ll | 244 + test/CodeGen/Mips/jtstat.ll | 2 +- test/CodeGen/Mips/l3mc.ll | 20 +- test/CodeGen/Mips/lb1.ll | 2 +- test/CodeGen/Mips/lbu1.ll | 2 +- test/CodeGen/Mips/lcb2.ll | 4 +- test/CodeGen/Mips/lcb3c.ll | 2 +- test/CodeGen/Mips/lcb4a.ll | 2 +- test/CodeGen/Mips/lcb5.ll | 2 +- test/CodeGen/Mips/lh1.ll | 2 +- test/CodeGen/Mips/lhu1.ll | 2 +- test/CodeGen/Mips/llcarry.ll | 2 +- test/CodeGen/Mips/llvm-ir/atomicrmx.ll | 26 + test/CodeGen/Mips/llvm-ir/call.ll | 14 + test/CodeGen/Mips/llvm-ir/load-atomic.ll | 42 + test/CodeGen/Mips/llvm-ir/sqrt.ll | 13 + test/CodeGen/Mips/llvm-ir/store-atomic.ll | 42 + test/CodeGen/Mips/madd-msub.ll | 2 +- test/CodeGen/Mips/mbrsize4a.ll | 2 +- test/CodeGen/Mips/mips16-hf-attr-2.ll | 2 +- 
test/CodeGen/Mips/mips16-hf-attr.ll | 2 +- test/CodeGen/Mips/mips16_32_1.ll | 2 +- test/CodeGen/Mips/mips16_32_10.ll | 2 +- test/CodeGen/Mips/mips16_32_3.ll | 2 +- test/CodeGen/Mips/mips16_32_4.ll | 2 +- test/CodeGen/Mips/mips16_32_5.ll | 2 +- test/CodeGen/Mips/mips16_32_6.ll | 2 +- test/CodeGen/Mips/mips16_32_7.ll | 2 +- test/CodeGen/Mips/mips16_fpret.ll | 8 +- test/CodeGen/Mips/mips16ex.ll | 2 +- test/CodeGen/Mips/mips16fpe.ll | 6 +- test/CodeGen/Mips/misha.ll | 2 +- test/CodeGen/Mips/msa/elm_copy.ll | 5 +- test/CodeGen/Mips/mul.ll | 2 +- test/CodeGen/Mips/mulll.ll | 2 +- test/CodeGen/Mips/mulull.ll | 2 +- test/CodeGen/Mips/nacl-align.ll | 7 +- test/CodeGen/Mips/neg1.ll | 2 +- test/CodeGen/Mips/no-odd-spreg-msa.ll | 24 +- test/CodeGen/Mips/nomips16.ll | 2 +- test/CodeGen/Mips/not1.ll | 2 +- test/CodeGen/Mips/null.ll | 2 +- test/CodeGen/Mips/or1.ll | 2 +- test/CodeGen/Mips/powif64_16.ll | 2 +- test/CodeGen/Mips/rem.ll | 2 +- test/CodeGen/Mips/remu.ll | 2 +- test/CodeGen/Mips/s2rem.ll | 4 +- test/CodeGen/Mips/sb1.ll | 2 +- test/CodeGen/Mips/sel1c.ll | 2 +- test/CodeGen/Mips/sel2c.ll | 2 +- test/CodeGen/Mips/selTBteqzCmpi.ll | 2 +- test/CodeGen/Mips/selTBtnezCmpi.ll | 2 +- test/CodeGen/Mips/selTBtnezSlti.ll | 2 +- test/CodeGen/Mips/seleq.ll | 2 +- test/CodeGen/Mips/seleqk.ll | 2 +- test/CodeGen/Mips/selgek.ll | 2 +- test/CodeGen/Mips/selgt.ll | 2 +- test/CodeGen/Mips/selle.ll | 2 +- test/CodeGen/Mips/selltk.ll | 2 +- test/CodeGen/Mips/selne.ll | 2 +- test/CodeGen/Mips/selnek.ll | 2 +- test/CodeGen/Mips/selpat.ll | 2 +- test/CodeGen/Mips/seteq.ll | 2 +- test/CodeGen/Mips/seteqz.ll | 2 +- test/CodeGen/Mips/setge.ll | 2 +- test/CodeGen/Mips/setgek.ll | 2 +- test/CodeGen/Mips/setle.ll | 2 +- test/CodeGen/Mips/setlt.ll | 2 +- test/CodeGen/Mips/setltk.ll | 2 +- test/CodeGen/Mips/setne.ll | 2 +- test/CodeGen/Mips/setuge.ll | 2 +- test/CodeGen/Mips/setugt.ll | 2 +- test/CodeGen/Mips/setule.ll | 2 +- test/CodeGen/Mips/setult.ll | 2 +- test/CodeGen/Mips/setultk.ll | 2 +- 
test/CodeGen/Mips/sh1.ll | 2 +- test/CodeGen/Mips/simplebr.ll | 2 +- test/CodeGen/Mips/sitofp-selectcc-opt.ll | 3 +- test/CodeGen/Mips/sll1.ll | 2 +- test/CodeGen/Mips/sll2.ll | 2 +- test/CodeGen/Mips/sr1.ll | 4 +- test/CodeGen/Mips/sra1.ll | 2 +- test/CodeGen/Mips/sra2.ll | 2 +- test/CodeGen/Mips/srl1.ll | 2 +- test/CodeGen/Mips/srl2.ll | 2 +- test/CodeGen/Mips/stchar.ll | 4 +- test/CodeGen/Mips/stldst.ll | 2 +- test/CodeGen/Mips/sub1.ll | 2 +- test/CodeGen/Mips/sub2.ll | 2 +- test/CodeGen/Mips/tail16.ll | 2 +- test/CodeGen/Mips/tailcall.ll | 2 +- test/CodeGen/Mips/tls-alias.ll | 2 +- test/CodeGen/Mips/tls16.ll | 2 +- test/CodeGen/Mips/tls16_2.ll | 2 +- test/CodeGen/Mips/trap1.ll | 2 +- test/CodeGen/Mips/ul1.ll | 2 +- test/CodeGen/Mips/xor1.ll | 2 +- test/CodeGen/NVPTX/branch-fold.ll | 40 + test/CodeGen/NVPTX/bypass-div.ll | 80 + test/CodeGen/NVPTX/combine-min-max.ll | 307 + test/CodeGen/NVPTX/fma-assoc.ll | 13 + test/CodeGen/NVPTX/global-addrspace.ll | 12 + test/CodeGen/NVPTX/load-with-non-coherent-cache.ll | 264 + test/CodeGen/NVPTX/lower-aggr-copies.ll | 118 +- test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll | 20 +- test/CodeGen/NVPTX/reg-copy.ll | 224 + test/CodeGen/NVPTX/symbol-naming.ll | 4 +- test/CodeGen/NVPTX/vector-call.ll | 2 +- test/CodeGen/PowerPC/2006-01-20-ShiftPartsCrash.ll | 1 + test/CodeGen/PowerPC/2006-08-15-SelectionCrash.ll | 1 + test/CodeGen/PowerPC/2006-12-07-LargeAlloca.ll | 1 + test/CodeGen/PowerPC/2006-12-07-SelectCrash.ll | 1 + test/CodeGen/PowerPC/2007-11-19-VectorSplitting.ll | 1 + test/CodeGen/PowerPC/BoolRetToIntTest.ll | 203 + test/CodeGen/PowerPC/BreakableToken-reduced.ll | 335 + test/CodeGen/PowerPC/aantidep-def-ec.mir | 117 + test/CodeGen/PowerPC/aantidep-inline-asm-use.ll | 305 + test/CodeGen/PowerPC/addisdtprelha-nonr3.mir | 80 + test/CodeGen/PowerPC/alias.ll | 4 +- test/CodeGen/PowerPC/bitcasts-direct-move.ll | 83 + test/CodeGen/PowerPC/bitreverse.ll | 23 + test/CodeGen/PowerPC/branch-hint.ll | 135 + 
test/CodeGen/PowerPC/coal-sections.ll | 24 + test/CodeGen/PowerPC/crbit-asm-disabled.ll | 16 + test/CodeGen/PowerPC/crbit-asm.ll | 3 +- test/CodeGen/PowerPC/cttz.ll | 2 +- test/CodeGen/PowerPC/dbg.ll | 10 +- test/CodeGen/PowerPC/dyn-alloca-offset.ll | 21 + test/CodeGen/PowerPC/e500-1.ll | 30 + test/CodeGen/PowerPC/emutls_generic.ll | 41 + test/CodeGen/PowerPC/fast-isel-binary.ll | 26 +- test/CodeGen/PowerPC/fast-isel-br-const.ll | 2 +- test/CodeGen/PowerPC/fast-isel-call.ll | 14 +- test/CodeGen/PowerPC/fast-isel-cmp-imm.ll | 34 +- test/CodeGen/PowerPC/fast-isel-const.ll | 2 +- test/CodeGen/PowerPC/fast-isel-conversion-p5.ll | 20 +- test/CodeGen/PowerPC/fast-isel-conversion.ll | 48 +- test/CodeGen/PowerPC/fast-isel-crash.ll | 4 +- test/CodeGen/PowerPC/fast-isel-ext.ll | 20 +- test/CodeGen/PowerPC/fast-isel-fold.ll | 26 +- test/CodeGen/PowerPC/fast-isel-indirectbr.ll | 2 +- test/CodeGen/PowerPC/fast-isel-load-store.ll | 34 +- test/CodeGen/PowerPC/fast-isel-redefinition.ll | 2 +- test/CodeGen/PowerPC/fast-isel-ret.ll | 52 +- test/CodeGen/PowerPC/fast-isel-shifter.ll | 12 +- .../PowerPC/fastisel-gep-promote-before-add.ll | 2 +- .../PowerPC/fma-mutate-register-constraint.ll | 89 + .../PowerPC/fp-int-conversions-direct-moves.ll | 24 +- .../PowerPC/fp128-bitcast-after-operation.ll | 137 + test/CodeGen/PowerPC/load-shift-combine.ll | 1 + test/CodeGen/PowerPC/long-compare.ll | 2 +- test/CodeGen/PowerPC/machine-combiner.ll | 188 + test/CodeGen/PowerPC/mc-instrlat.ll | 25 + test/CodeGen/PowerPC/mcm-13.ll | 27 + test/CodeGen/PowerPC/memcpy-vec.ll | 7 +- test/CodeGen/PowerPC/merge-st-chain-op.ll | 41 + .../PowerPC/p8-scalar_vector_conversions.ll | 1476 ++ test/CodeGen/PowerPC/peephole-align.ll | 335 + test/CodeGen/PowerPC/ppc-shrink-wrapping.ll | 784 + test/CodeGen/PowerPC/ppc32-i1-vaarg.ll | 2 +- test/CodeGen/PowerPC/ppc64-icbt-pwr7.ll | 8 +- test/CodeGen/PowerPC/ppcsoftops.ll | 50 + test/CodeGen/PowerPC/pr17168.ll | 366 +- test/CodeGen/PowerPC/pr24546.ll | 22 +- 
test/CodeGen/PowerPC/pr24636.ll | 41 + test/CodeGen/PowerPC/pr25157-peephole.ll | 61 + test/CodeGen/PowerPC/preincprep-nontrans-crash.ll | 94 + test/CodeGen/PowerPC/qpx-unal-cons-lds.ll | 217 + test/CodeGen/PowerPC/retaddr2.ll | 6 +- test/CodeGen/PowerPC/rm-zext.ll | 6 +- test/CodeGen/PowerPC/rotl-rotr-crash.ll | 12 + test/CodeGen/PowerPC/sdiv-pow2.ll | 8 +- .../selectiondag-extload-computeknownbits.ll | 12 + test/CodeGen/PowerPC/seteq-0.ll | 2 +- test/CodeGen/PowerPC/sjlj.ll | 20 +- test/CodeGen/PowerPC/stack-realign.ll | 26 +- test/CodeGen/PowerPC/stackmap-frame-setup.ll | 20 + test/CodeGen/PowerPC/swaps-le-5.ll | 4 +- test/CodeGen/PowerPC/swaps-le-6.ll | 42 + test/CodeGen/PowerPC/unal-vec-ldst.ll | 580 + test/CodeGen/PowerPC/unal-vec-negarith.ll | 17 + test/CodeGen/PowerPC/unwind-dw2-g.ll | 6 +- test/CodeGen/PowerPC/variable_elem_vec_extracts.ll | 114 + test/CodeGen/PowerPC/vec-asm-disabled.ll | 14 + test/CodeGen/PowerPC/vec_add_sub_quadword.ll | 6 +- .../PowerPC/vector-merge-store-fp-constants.ll | 28 + test/CodeGen/PowerPC/vsx.ll | 5 +- test/CodeGen/PowerPC/vsx_insert_extract_le.ll | 6 +- test/CodeGen/PowerPC/vsx_scalar_ld_st.ll | 6 +- test/CodeGen/PowerPC/vsx_shuffle_le.ll | 20 +- test/CodeGen/SPARC/2011-01-22-SRet.ll | 2 +- test/CodeGen/SPARC/32abi.ll | 191 + test/CodeGen/SPARC/64abi.ll | 84 +- test/CodeGen/SPARC/basictest.ll | 21 +- test/CodeGen/SPARC/float-constants.ll | 41 + test/CodeGen/SPARC/float.ll | 10 +- test/CodeGen/SPARC/fp128.ll | 4 +- test/CodeGen/SPARC/inlineasm.ll | 53 +- test/CodeGen/SPARC/missing-sret.ll | 9 + test/CodeGen/SPARC/reserved-regs.ll | 135 + test/CodeGen/SPARC/select-mask.ll | 17 + test/CodeGen/SPARC/spill.ll | 64 + test/CodeGen/SPARC/stack-align.ll | 22 + test/CodeGen/SPARC/tls.ll | 2 +- test/CodeGen/SPARC/varargs.ll | 2 +- test/CodeGen/SystemZ/alloca-03.ll | 84 + test/CodeGen/SystemZ/alloca-04.ll | 14 + test/CodeGen/SystemZ/args-01.ll | 4 +- test/CodeGen/SystemZ/args-02.ll | 4 +- test/CodeGen/SystemZ/args-03.ll | 4 +- 
test/CodeGen/SystemZ/args-04.ll | 2 +- test/CodeGen/SystemZ/args-07.ll | 2 +- test/CodeGen/SystemZ/asm-17.ll | 3 +- test/CodeGen/SystemZ/asm-18.ll | 3 +- test/CodeGen/SystemZ/dag-combine-01.ll | 97 + test/CodeGen/SystemZ/fp-abs-01.ll | 4 +- test/CodeGen/SystemZ/fp-abs-02.ll | 4 +- test/CodeGen/SystemZ/fp-add-02.ll | 2 +- test/CodeGen/SystemZ/fp-cmp-02.ll | 5 +- test/CodeGen/SystemZ/fp-cmp-05.ll | 80 + test/CodeGen/SystemZ/fp-const-02.ll | 4 +- test/CodeGen/SystemZ/fp-libcall.ll | 273 + test/CodeGen/SystemZ/fp-move-05.ll | 2 +- test/CodeGen/SystemZ/fp-neg-01.ll | 4 +- test/CodeGen/SystemZ/fp-sincos-01.ll | 56 + test/CodeGen/SystemZ/insert-05.ll | 4 +- test/CodeGen/SystemZ/int-cmp-44.ll | 3 +- test/CodeGen/SystemZ/int-cmp-51.ll | 34 + test/CodeGen/SystemZ/int-cmp-52.ll | 24 + test/CodeGen/SystemZ/memchr-01.ll | 2 +- test/CodeGen/SystemZ/spill-01.ll | 2 +- test/CodeGen/SystemZ/vec-args-04.ll | 26 +- test/CodeGen/SystemZ/vec-args-05.ll | 10 +- test/CodeGen/SystemZ/vec-perm-12.ll | 43 + test/CodeGen/SystemZ/vec-perm-13.ll | 38 + test/CodeGen/SystemZ/xor-01.ll | 2 +- test/CodeGen/Thumb/2010-07-15-debugOrdering.ll | 14 +- test/CodeGen/Thumb/cortex-m0-unaligned-access.ll | 2 +- test/CodeGen/Thumb/large-stack.ll | 20 +- .../Thumb/ldm-stm-base-materialization-thumb2.ll | 93 + test/CodeGen/Thumb/ldm-stm-base-materialization.ll | 77 +- test/CodeGen/Thumb/pop.ll | 4 +- test/CodeGen/Thumb/segmented-stacks.ll | 24 +- test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll | 36 - test/CodeGen/Thumb/thumb-shrink-wrapping.ll | 691 + test/CodeGen/Thumb/vargs.ll | 6 +- test/CodeGen/Thumb2/crash.ll | 14 +- test/CodeGen/Thumb2/emit-unwinding.ll | 11 + test/CodeGen/Thumb2/float-cmp.ll | 44 +- test/CodeGen/Thumb2/float-intrinsics-double.ll | 11 +- test/CodeGen/Thumb2/float-intrinsics-float.ll | 4 +- test/CodeGen/Thumb2/ifcvt-compare.ll | 6 +- test/CodeGen/Thumb2/machine-licm.ll | 8 +- test/CodeGen/Thumb2/pic-load.ll | 12 +- test/CodeGen/Thumb2/setjmp_longjmp.ll | 89 + 
test/CodeGen/Thumb2/thumb2-ifcvt1.ll | 14 +- test/CodeGen/Thumb2/thumb2-ifcvt2.ll | 4 +- test/CodeGen/Thumb2/thumb2-mulhi.ll | 2 +- test/CodeGen/Thumb2/thumb2-smla.ll | 4 +- test/CodeGen/Thumb2/thumb2-smul.ll | 2 +- test/CodeGen/Thumb2/thumb2-spill-q.ll | 28 +- test/CodeGen/Thumb2/thumb2-uxt_rot.ll | 8 +- test/CodeGen/Thumb2/v8_IT_1.ll | 4 +- test/CodeGen/Thumb2/v8_IT_3.ll | 5 +- test/CodeGen/Thumb2/v8_IT_5.ll | 4 +- test/CodeGen/WebAssembly/call.ll | 127 + test/CodeGen/WebAssembly/cfg-stackify.ll | 1102 ++ test/CodeGen/WebAssembly/comparisons_f32.ll | 181 + test/CodeGen/WebAssembly/comparisons_f64.ll | 181 + test/CodeGen/WebAssembly/comparisons_i32.ll | 98 + test/CodeGen/WebAssembly/comparisons_i64.ll | 98 + test/CodeGen/WebAssembly/conv.ll | 255 + test/CodeGen/WebAssembly/copysign-casts.ll | 28 + test/CodeGen/WebAssembly/cpus.ll | 17 + test/CodeGen/WebAssembly/dead-vreg.ll | 51 + test/CodeGen/WebAssembly/f32.ll | 154 + test/CodeGen/WebAssembly/f64.ll | 154 + test/CodeGen/WebAssembly/fast-isel.ll | 20 + test/CodeGen/WebAssembly/frem.ll | 26 + test/CodeGen/WebAssembly/func.ll | 62 + test/CodeGen/WebAssembly/global.ll | 177 + test/CodeGen/WebAssembly/globl.ll | 10 + test/CodeGen/WebAssembly/i32.ll | 190 + test/CodeGen/WebAssembly/i64.ll | 190 + test/CodeGen/WebAssembly/ident.ll | 12 + test/CodeGen/WebAssembly/immediates.ll | 198 + test/CodeGen/WebAssembly/inline-asm.ll | 94 + test/CodeGen/WebAssembly/legalize.ll | 62 + test/CodeGen/WebAssembly/load-ext.ll | 96 + test/CodeGen/WebAssembly/load-store-i1.ll | 68 + test/CodeGen/WebAssembly/load.ll | 46 + test/CodeGen/WebAssembly/loop-idiom.ll | 53 + test/CodeGen/WebAssembly/memory-addr32.ll | 27 + test/CodeGen/WebAssembly/memory-addr64.ll | 27 + test/CodeGen/WebAssembly/offset-folding.ll | 48 + test/CodeGen/WebAssembly/offset.ll | 185 + test/CodeGen/WebAssembly/phi.ll | 47 + test/CodeGen/WebAssembly/reg-stackify.ll | 126 + test/CodeGen/WebAssembly/return-int32.ll | 10 + test/CodeGen/WebAssembly/return-void.ll | 10 + 
test/CodeGen/WebAssembly/returned.ll | 49 + test/CodeGen/WebAssembly/select.ll | 135 + test/CodeGen/WebAssembly/signext-zeroext.ll | 60 + test/CodeGen/WebAssembly/store-results.ll | 61 + test/CodeGen/WebAssembly/store-trunc.ll | 46 + test/CodeGen/WebAssembly/store.ll | 42 + test/CodeGen/WebAssembly/switch.ll | 174 + test/CodeGen/WebAssembly/unreachable.ll | 34 + test/CodeGen/WebAssembly/unused-argument.ll | 31 + test/CodeGen/WebAssembly/userstack.ll | 81 + test/CodeGen/WebAssembly/varargs.ll | 123 + test/CodeGen/WebAssembly/vtable.ll | 171 + test/CodeGen/WinEH/cppeh-alloca-sink.ll | 180 - test/CodeGen/WinEH/cppeh-catch-all-win32.ll | 86 - test/CodeGen/WinEH/cppeh-catch-all.ll | 97 - test/CodeGen/WinEH/cppeh-catch-and-throw.ll | 143 - test/CodeGen/WinEH/cppeh-catch-scalar.ll | 126 - test/CodeGen/WinEH/cppeh-catch-unwind.ll | 240 - test/CodeGen/WinEH/cppeh-cleanup-invoke.ll | 91 - test/CodeGen/WinEH/cppeh-demote-liveout.ll | 72 - test/CodeGen/WinEH/cppeh-frame-vars.ll | 272 - test/CodeGen/WinEH/cppeh-inalloca.ll | 194 - test/CodeGen/WinEH/cppeh-min-unwind.ll | 99 - .../CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll | 106 - test/CodeGen/WinEH/cppeh-multi-catch.ll | 226 - test/CodeGen/WinEH/cppeh-nested-1.ll | 194 - test/CodeGen/WinEH/cppeh-nested-2.ll | 324 - test/CodeGen/WinEH/cppeh-nested-3.ll | 260 - test/CodeGen/WinEH/cppeh-nested-rethrow.ll | 212 - test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll | 278 - test/CodeGen/WinEH/cppeh-prepared-catch-all.ll | 47 - .../WinEH/cppeh-prepared-catch-reordered.ll | 165 - test/CodeGen/WinEH/cppeh-prepared-catch.ll | 232 - test/CodeGen/WinEH/cppeh-prepared-cleanups.ll | 245 - test/CodeGen/WinEH/cppeh-shared-empty-catch.ll | 110 - test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll | 394 - test/CodeGen/WinEH/cppeh-state-calc-1.ll | 289 - test/CodeGen/WinEH/seh-catch-all.ll | 59 - test/CodeGen/WinEH/seh-exception-code.ll | 66 - test/CodeGen/WinEH/seh-exception-code2.ll | 91 - test/CodeGen/WinEH/seh-inlined-finally.ll | 83 - 
test/CodeGen/WinEH/seh-outlined-finally-win32.ll | 172 - test/CodeGen/WinEH/seh-outlined-finally.ll | 155 - test/CodeGen/WinEH/seh-prepared-basic.ll | 83 - test/CodeGen/WinEH/seh-resume-phi.ll | 66 - test/CodeGen/WinEH/seh-simple.ll | 233 - test/CodeGen/WinEH/wineh-cloning.ll | 391 + test/CodeGen/WinEH/wineh-demotion.ll | 356 + test/CodeGen/WinEH/wineh-intrinsics-invalid.ll | 26 + test/CodeGen/WinEH/wineh-intrinsics.ll | 44 + test/CodeGen/WinEH/wineh-no-demotion.ll | 130 + .../CodeGen/WinEH/wineh-statenumbering-cleanups.ll | 62 + test/CodeGen/WinEH/wineh-statenumbering.ll | 148 + test/CodeGen/X86/2006-10-02-BoolRetCrash.ll | 1 + .../X86/2006-10-19-SwitchUnnecessaryBranching.ll | 4 +- test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll | 2 +- test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll | 2 +- test/CodeGen/X86/2008-03-14-SpillerCrash.ll | 2 +- .../CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll | 8 +- test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll | 15 +- test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll | 8 +- test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll | 14 +- test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll | 8 +- test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll | 1 + test/CodeGen/X86/2009-06-06-ConcatVectors.ll | 1 + test/CodeGen/X86/2009-10-16-Scope.ll | 6 +- test/CodeGen/X86/2010-01-18-DbgValue.ll | 8 +- test/CodeGen/X86/2010-02-01-DbgValueCrash.ll | 8 +- test/CodeGen/X86/2010-05-25-DotDebugLoc.ll | 22 +- test/CodeGen/X86/2010-05-26-DotDebugLoc.ll | 20 +- test/CodeGen/X86/2010-05-28-Crash.ll | 18 +- test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll | 28 +- test/CodeGen/X86/2010-07-06-DbgCrash.ll | 7 +- test/CodeGen/X86/2010-08-04-StackVariable.ll | 24 +- test/CodeGen/X86/2010-09-16-EmptyFilename.ll | 10 +- test/CodeGen/X86/2010-11-02-DbgParameter.ll | 8 +- test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll | 20 +- test/CodeGen/X86/2011-10-21-widen-cmp.ll | 42 +- .../X86/2011-12-06-AVXVectorExtractCombine.ll | 13 +- test/CodeGen/X86/2011-20-21-zext-ui2fp.ll | 14 +- 
test/CodeGen/X86/2012-01-12-extract-sv.ll | 28 +- test/CodeGen/X86/2012-08-17-legalizer-crash.ll | 3 +- test/CodeGen/X86/2012-1-10-buildvector.ll | 1 + test/CodeGen/X86/2012-11-30-handlemove-dbg.ll | 8 +- test/CodeGen/X86/2012-11-30-misched-dbg.ll | 16 +- test/CodeGen/X86/2012-11-30-regpres-dbg.ll | 8 +- test/CodeGen/X86/3dnow-intrinsics.ll | 4 +- test/CodeGen/X86/GC/alloc_loop.ll | 1 + test/CodeGen/X86/GC/cg-O0.ll | 1 + test/CodeGen/X86/GC/dynamic-frame-size.ll | 10 +- test/CodeGen/X86/GC/lower_gcroot.ll | 1 + test/CodeGen/X86/MachineBranchProb.ll | 4 +- test/CodeGen/X86/MachineSink-DbgValue.ll | 12 +- test/CodeGen/X86/MergeConsecutiveStores.ll | 37 +- test/CodeGen/X86/StackColoring-dbg.ll | 6 +- test/CodeGen/X86/add-nsw-sext.ll | 168 + test/CodeGen/X86/aliases.ll | 26 +- test/CodeGen/X86/and-encoding.ll | 41 + test/CodeGen/X86/atomic-flags.ll | 61 + test/CodeGen/X86/atomic-minmax-i6432.ll | 8 +- test/CodeGen/X86/atomic-non-integer.ll | 108 + test/CodeGen/X86/atomic128.ll | 52 +- test/CodeGen/X86/atomic_mi.ll | 662 +- test/CodeGen/X86/avg.ll | 724 + test/CodeGen/X86/avx-cvt-2.ll | 1 + test/CodeGen/X86/avx-cvt.ll | 6 +- test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 66 + test/CodeGen/X86/avx-intrinsics-x86.ll | 685 +- test/CodeGen/X86/avx-isa-check.ll | 570 + test/CodeGen/X86/avx-load-store.ll | 4 +- test/CodeGen/X86/avx-logic.ll | 2 + test/CodeGen/X86/avx-shift.ll | 1 + test/CodeGen/X86/avx-shuffle-x86_32.ll | 26 +- test/CodeGen/X86/avx-splat.ll | 114 +- test/CodeGen/X86/avx-vbroadcast.ll | 261 +- test/CodeGen/X86/avx-vperm2x128.ll | 44 +- test/CodeGen/X86/avx-win64.ll | 2 - test/CodeGen/X86/avx.ll | 6 +- test/CodeGen/X86/avx2-conversions.ll | 131 +- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll | 120 + test/CodeGen/X86/avx2-intrinsics-x86.ll | 94 +- test/CodeGen/X86/avx2-nontemporal.ll | 17 +- test/CodeGen/X86/avx2-vbroadcast.ll | 441 +- test/CodeGen/X86/avx512-arith.ll | 358 +- test/CodeGen/X86/avx512-bugfix-25270.ll | 35 + 
test/CodeGen/X86/avx512-build-vector.ll | 1 + test/CodeGen/X86/avx512-calling-conv.ll | 481 +- test/CodeGen/X86/avx512-cvt.ll | 119 +- test/CodeGen/X86/avx512-ext.ll | 1835 +++ test/CodeGen/X86/avx512-extract-subvector.ll | 56 + test/CodeGen/X86/avx512-fma.ll | 155 +- test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 185 +- test/CodeGen/X86/avx512-insert-extract.ll | 519 +- test/CodeGen/X86/avx512-intrinsics.ll | 4965 +++++- test/CodeGen/X86/avx512-logic.ll | 164 +- test/CodeGen/X86/avx512-mask-op.ll | 1472 +- test/CodeGen/X86/avx512-skx-insert-subvec.ll | 135 + test/CodeGen/X86/avx512-trunc-ext.ll | 961 -- test/CodeGen/X86/avx512-trunc.ll | 488 + test/CodeGen/X86/avx512-vbroadcast.ll | 262 +- test/CodeGen/X86/avx512-vec-cmp.ll | 27 +- test/CodeGen/X86/avx512bw-intrinsics.ll | 2674 ++- test/CodeGen/X86/avx512bwvl-intrinsics.ll | 748 + test/CodeGen/X86/avx512cd-intrinsics.ll | 18 + test/CodeGen/X86/avx512cdvl-intrinsics.ll | 179 + test/CodeGen/X86/avx512dq-intrinsics.ll | 667 + test/CodeGen/X86/avx512dqvl-intrinsics.ll | 818 +- test/CodeGen/X86/avx512vl-intrinsics.ll | 2977 +++- test/CodeGen/X86/bit-piece-comment.ll | 64 + test/CodeGen/X86/bitreverse.ll | 22 + test/CodeGen/X86/branchfolding-catchpads.ll | 95 + test/CodeGen/X86/buildvec-insertvec.ll | 1 + test/CodeGen/X86/catchpad-realign-savexmm.ll | 53 + test/CodeGen/X86/catchpad-regmask.ll | 144 + test/CodeGen/X86/catchpad-weight.ll | 82 + test/CodeGen/X86/catchret-empty-fallthrough.ll | 53 + test/CodeGen/X86/catchret-fallthrough.ll | 42 + test/CodeGen/X86/cleanuppad-inalloca.ll | 68 + test/CodeGen/X86/cleanuppad-large-codemodel.ll | 27 + test/CodeGen/X86/cleanuppad-realign.ll | 78 + test/CodeGen/X86/clz.ll | 148 +- test/CodeGen/X86/cmp.ll | 44 + test/CodeGen/X86/cmpxchg-clobber-flags.ll | 150 +- test/CodeGen/X86/coal-sections.ll | 23 + test/CodeGen/X86/coalescer-win64.ll | 16 + .../CodeGen/X86/code_placement_cold_loop_blocks.ll | 122 + .../code_placement_ignore_succ_in_inner_loop.ll | 123 + 
test/CodeGen/X86/code_placement_loop_rotation.ll | 80 + test/CodeGen/X86/code_placement_loop_rotation2.ll | 122 + test/CodeGen/X86/codegen-prepare-cast.ll | 2 +- test/CodeGen/X86/coff-comdat.ll | 2 +- test/CodeGen/X86/combine-and.ll | 1 + test/CodeGen/X86/combine-avx-intrinsics.ll | 59 - test/CodeGen/X86/combine-avx2-intrinsics.ll | 74 - test/CodeGen/X86/combine-multiplies.ll | 163 + test/CodeGen/X86/combine-or.ll | 1 + test/CodeGen/X86/combine-sse2-intrinsics.ll | 53 - test/CodeGen/X86/combine-sse41-intrinsics.ll | 91 - test/CodeGen/X86/commute-two-addr.ll | 2 +- test/CodeGen/X86/constant-hoisting-and.ll | 19 + test/CodeGen/X86/constant-hoisting-cmp.ll | 25 + test/CodeGen/X86/copysign-constant-magnitude.ll | 24 +- test/CodeGen/X86/cppeh-nounwind.ll | 35 - test/CodeGen/X86/cxx_tlscc64.ll | 71 + test/CodeGen/X86/dag-fmf-cse.ll | 22 + test/CodeGen/X86/dag-merge-fast-accesses.ll | 90 + test/CodeGen/X86/darwin-tls.ll | 28 + .../X86/dbg-changes-codegen-branch-folding.ll | 48 +- test/CodeGen/X86/dbg-changes-codegen.ll | 9 +- test/CodeGen/X86/dbg-combine.ll | 12 +- test/CodeGen/X86/debugloc-argsize.ll | 58 + test/CodeGen/X86/divide-by-constant.ll | 32 + test/CodeGen/X86/dllexport-x86_64.ll | 10 +- test/CodeGen/X86/dllexport.ll | 8 +- test/CodeGen/X86/dwarf-comp-dir.ll | 2 +- test/CodeGen/X86/dynamic-allocas-VLAs.ll | 2 +- test/CodeGen/X86/eh-null-personality.ll | 25 + test/CodeGen/X86/eh_frame.ll | 4 +- test/CodeGen/X86/emutls-pic.ll | 168 + test/CodeGen/X86/emutls-pie.ll | 131 + test/CodeGen/X86/emutls.ll | 347 + test/CodeGen/X86/emutls_generic.ll | 107 + test/CodeGen/X86/exedeps-movq.ll | 19 + test/CodeGen/X86/expand-vr64-gr64-copy.mir | 36 + .../X86/extractelement-legalization-cycle.ll | 21 + test/CodeGen/X86/extractelement-shuffle.ll | 1 + test/CodeGen/X86/fadd-combines.ll | 224 + test/CodeGen/X86/fast-isel-bitcasts-avx.ll | 244 + test/CodeGen/X86/fast-isel-bitcasts.ll | 245 + test/CodeGen/X86/fast-isel-cmp-branch.ll | 17 +- test/CodeGen/X86/fast-isel-deadcode.ll | 
147 + test/CodeGen/X86/fast-isel-emutls.ll | 48 + test/CodeGen/X86/fast-isel-nontemporal.ll | 111 + test/CodeGen/X86/fast-isel-stackcheck.ll | 44 + test/CodeGen/X86/fast-isel-tls.ll | 2 +- test/CodeGen/X86/fdiv-combine.ll | 69 +- test/CodeGen/X86/fdiv.ll | 52 +- test/CodeGen/X86/fixup-lea.ll | 34 + test/CodeGen/X86/float-asmprint.ll | 15 + test/CodeGen/X86/floor-soft-float.ll | 2 +- test/CodeGen/X86/fma-commute-x86.ll | 761 + test/CodeGen/X86/fma-do-not-commute.ll | 2 +- test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll | 499 +- test/CodeGen/X86/fma-intrinsics-x86.ll | 688 +- test/CodeGen/X86/fma-scalar-memfold.ll | 383 + test/CodeGen/X86/fma_patterns.ll | 1301 +- test/CodeGen/X86/fma_patterns_wide.ll | 851 +- test/CodeGen/X86/fmaxnum.ll | 203 +- test/CodeGen/X86/fminnum.ll | 181 +- test/CodeGen/X86/fmul-combines.ll | 44 +- test/CodeGen/X86/fold-load-binops.ll | 1 + test/CodeGen/X86/fold-load-unops.ll | 1 + test/CodeGen/X86/fold-push.ll | 40 + test/CodeGen/X86/force-align-stack-alloca.ll | 2 +- test/CodeGen/X86/force-align-stack.ll | 2 +- test/CodeGen/X86/fp-fast.ll | 1 + test/CodeGen/X86/fp-logic.ll | 264 + test/CodeGen/X86/fp128-calling-conv.ll | 47 + test/CodeGen/X86/fp128-cast.ll | 279 + test/CodeGen/X86/fp128-compare.ll | 96 + test/CodeGen/X86/fp128-i128.ll | 320 + test/CodeGen/X86/fp128-libcalls.ll | 107 + test/CodeGen/X86/fp128-load.ll | 35 + test/CodeGen/X86/fp128-store.ll | 14 + test/CodeGen/X86/fpcmp-soft-fp.ll | 127 + test/CodeGen/X86/fpstack-debuginstr-kill.ll | 16 +- test/CodeGen/X86/frameescape.ll | 128 - test/CodeGen/X86/frem-msvc32.ll | 12 + test/CodeGen/X86/funclet-layout.ll | 158 + test/CodeGen/X86/function-alias.ll | 12 + test/CodeGen/X86/gcc_except_table.ll | 2 +- test/CodeGen/X86/global-sections.ll | 7 +- test/CodeGen/X86/h-register-store.ll | 25 +- test/CodeGen/X86/h-registers-0.ll | 1 + test/CodeGen/X86/h-registers-1.ll | 1 + test/CodeGen/X86/h-registers-3.ll | 1 + test/CodeGen/X86/half.ll | 4 +- test/CodeGen/X86/hhvm-cc.ll | 241 + 
test/CodeGen/X86/i386-shrink-wrapping.ll | 113 + test/CodeGen/X86/immediate_merging.ll | 82 + test/CodeGen/X86/implicit-null-check.ll | 51 +- test/CodeGen/X86/imul.ll | 63 + test/CodeGen/X86/inalloca-stdcall.ll | 5 +- test/CodeGen/X86/inalloca.ll | 15 +- test/CodeGen/X86/inconsistent_landingpad.ll | 30 + test/CodeGen/X86/inline-asm-2addr.ll | 11 +- test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll | 2 +- test/CodeGen/X86/inline-sse.ll | 34 + test/CodeGen/X86/insertps-from-constantpool.ll | 20 + test/CodeGen/X86/insertps-unfold-load-bug.ll | 33 + test/CodeGen/X86/int-intrinsic.ll | 2 +- test/CodeGen/X86/late-address-taken.ll | 68 + test/CodeGen/X86/lea-opt.ll | 131 + test/CodeGen/X86/lit.local.cfg | 2 +- test/CodeGen/X86/localescape.ll | 143 + test/CodeGen/X86/lower-vec-shift-2.ll | 1 + test/CodeGen/X86/lsr-static-addr.ll | 2 +- test/CodeGen/X86/machine-combiner-int-vec.ll | 112 + test/CodeGen/X86/machine-combiner-int.ll | 194 + test/CodeGen/X86/machine-combiner.ll | 467 +- test/CodeGen/X86/machine-cp.ll | 38 +- test/CodeGen/X86/machine-trace-metrics-crash.ll | 4 +- test/CodeGen/X86/masked_gather_scatter.ll | 2012 ++- test/CodeGen/X86/masked_memop.ll | 524 +- test/CodeGen/X86/materialize.ll | 184 + test/CodeGen/X86/mcu-abi.ll | 112 + test/CodeGen/X86/memcpy-2.ll | 26 +- test/CodeGen/X86/memcpy.ll | 33 + .../X86/merge-store-partially-alias-loads.ll | 52 + .../X86/misched-code-difference-with-debug.ll | 12 +- test/CodeGen/X86/mmx-arg-passing-x86-64.ll | 1 + test/CodeGen/X86/mmx-arg-passing.ll | 1 + test/CodeGen/X86/mmx-coalescing.ll | 84 + test/CodeGen/X86/mmx-intrinsics.ll | 291 +- test/CodeGen/X86/mmx-only.ll | 21 + test/CodeGen/X86/movntdq-no-avx.ll | 2 +- test/CodeGen/X86/movpc32-check.ll | 42 + test/CodeGen/X86/movtopush.ll | 25 +- test/CodeGen/X86/mult-alt-x86.ll | 2 +- test/CodeGen/X86/musttail-varargs.ll | 43 + test/CodeGen/X86/nontemporal-2.ll | 21 +- test/CodeGen/X86/nontemporal.ll | 11 +- test/CodeGen/X86/null-streamer.ll | 4 +- 
test/CodeGen/X86/opt-ext-uses.ll | 8 +- test/CodeGen/X86/or-branch.ll | 30 +- test/CodeGen/X86/or-lea.ll | 120 + test/CodeGen/X86/palignr.ll | 1 + test/CodeGen/X86/patchpoint-verifiable.mir | 42 + test/CodeGen/X86/peephole-na-phys-copy-folding.ll | 190 + test/CodeGen/X86/pmul.ll | 297 +- test/CodeGen/X86/pop-stack-cleanup.ll | 76 + test/CodeGen/X86/powi.ll | 38 +- test/CodeGen/X86/pr11415.ll | 8 +- test/CodeGen/X86/pr11468.ll | 2 +- test/CodeGen/X86/pr11985.ll | 30 +- test/CodeGen/X86/pr13577.ll | 5 +- test/CodeGen/X86/pr15267.ll | 240 +- test/CodeGen/X86/pr17631.ll | 2 +- test/CodeGen/X86/pr21529.ll | 15 - test/CodeGen/X86/pr22019.ll | 2 +- test/CodeGen/X86/pr23900.ll | 29 - test/CodeGen/X86/pr24139.ll | 148 + test/CodeGen/X86/pr24602.ll | 17 + test/CodeGen/X86/pr25828.ll | 30 + test/CodeGen/X86/prolog-push-seq.ll | 19 + test/CodeGen/X86/pseudo_cmov_lower.ll | 267 + test/CodeGen/X86/pseudo_cmov_lower1.ll | 39 + test/CodeGen/X86/pseudo_cmov_lower2.ll | 100 + test/CodeGen/X86/psubus.ll | 580 +- test/CodeGen/X86/push-cfi-debug.ll | 53 + test/CodeGen/X86/push-cfi-obj.ll | 44 + test/CodeGen/X86/push-cfi.ll | 304 + test/CodeGen/X86/ragreedy-hoist-spill.ll | 2 +- test/CodeGen/X86/rem_crash.ll | 257 + test/CodeGen/X86/remat-invalid-liveness.ll | 85 - test/CodeGen/X86/rodata-relocs.ll | 8 +- test/CodeGen/X86/rounding-ops.ll | 24 +- test/CodeGen/X86/safestack.ll | 32 + test/CodeGen/X86/sar_fold.ll | 37 + test/CodeGen/X86/sar_fold64.ll | 43 + test/CodeGen/X86/scalar-fp-to-i64.ll | 151 + test/CodeGen/X86/scalar-int-to-fp.ll | 132 + test/CodeGen/X86/sdiv-pow2.ll | 33 + test/CodeGen/X86/seh-catch-all-win32.ll | 33 +- test/CodeGen/X86/seh-catch-all.ll | 29 +- test/CodeGen/X86/seh-catchpad.ll | 198 + test/CodeGen/X86/seh-except-finally.ll | 71 +- test/CodeGen/X86/seh-exception-code.ll | 38 + test/CodeGen/X86/seh-filter.ll | 21 - test/CodeGen/X86/seh-finally.ll | 50 +- test/CodeGen/X86/seh-safe-div-win32.ll | 42 +- test/CodeGen/X86/seh-safe-div.ll | 54 +- 
test/CodeGen/X86/seh-stack-realign-win32.ll | 99 - test/CodeGen/X86/seh-stack-realign.ll | 34 +- test/CodeGen/X86/setcc-lowering.ll | 1 + test/CodeGen/X86/setcc.ll | 20 + test/CodeGen/X86/shift-bmi2.ll | 20 +- test/CodeGen/X86/shrink-wrap-chkstk.ll | 37 + test/CodeGen/X86/slow-div.ll | 15 + test/CodeGen/X86/slow-unaligned-mem.ll | 95 + test/CodeGen/X86/soft-fp.ll | 34 +- test/CodeGen/X86/soft-sitofp.ll | 169 + test/CodeGen/X86/splat-for-size.ll | 197 +- test/CodeGen/X86/sqrt-fastmath.ll | 9 +- test/CodeGen/X86/sse-align-12.ll | 1 + test/CodeGen/X86/sse-minmax.ll | 2 +- test/CodeGen/X86/sse-only.ll | 20 + test/CodeGen/X86/sse-scalar-fp-arith-unary.ll | 1 + test/CodeGen/X86/sse2-vector-shifts.ll | 282 +- test/CodeGen/X86/sse2.ll | 1 + test/CodeGen/X86/sse3-avx-addsub-2.ll | 312 +- test/CodeGen/X86/sse3-avx-addsub.ll | 197 +- test/CodeGen/X86/sse3-intrinsics-fast-isel.ll | 171 + test/CodeGen/X86/sse3.ll | 7 +- test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll | 47 +- test/CodeGen/X86/sse41-intrinsics-x86.ll | 48 - test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll | 185 +- test/CodeGen/X86/sse41.ll | 65 +- test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll | 98 + test/CodeGen/X86/sse_partial_update.ll | 33 + test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll | 290 + test/CodeGen/X86/stack-align-memcpy.ll | 2 +- test/CodeGen/X86/stack-folding-adx-x86_64.ll | 45 + test/CodeGen/X86/stack-folding-fp-avx1.ll | 18 +- test/CodeGen/X86/stack-folding-fp-sse42.ll | 28 +- test/CodeGen/X86/stack-folding-int-avx1.ll | 40 +- test/CodeGen/X86/stack-folding-int-avx2.ll | 55 +- test/CodeGen/X86/stack-folding-int-sse42.ll | 38 +- test/CodeGen/X86/stack-folding-mmx.ll | 148 +- test/CodeGen/X86/stack-folding-x86_64.ll | 2 +- test/CodeGen/X86/stack-folding-xop.ll | 2 +- test/CodeGen/X86/stack-probe-size.ll | 3 +- test/CodeGen/X86/stack-protector-dbginfo.ll | 36 +- test/CodeGen/X86/stack-protector-weight.ll | 4 +- test/CodeGen/X86/stackmap-frame-setup.ll | 20 + test/CodeGen/X86/statepoint-allocas.ll | 10 
+- test/CodeGen/X86/statepoint-call-lowering.ll | 103 +- test/CodeGen/X86/statepoint-far-call.ll | 4 +- test/CodeGen/X86/statepoint-forward.ll | 16 +- .../X86/statepoint-gctransition-call-lowering.ll | 66 +- test/CodeGen/X86/statepoint-invoke.ll | 78 +- test/CodeGen/X86/statepoint-stack-usage.ll | 54 +- test/CodeGen/X86/statepoint-stackmap-format.ll | 96 +- test/CodeGen/X86/stdarg.ll | 10 +- test/CodeGen/X86/stores-merging.ll | 46 +- test/CodeGen/X86/switch-bt.ll | 8 +- test/CodeGen/X86/switch-edge-weight.ll | 281 + test/CodeGen/X86/switch-jump-table.ll | 54 +- test/CodeGen/X86/switch-order-weight.ll | 2 +- test/CodeGen/X86/switch.ll | 85 +- test/CodeGen/X86/swizzle-2.ll | 1 + test/CodeGen/X86/system-intrinsics-64-xsave.ll | 41 + test/CodeGen/X86/system-intrinsics-64-xsavec.ll | 21 + test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll | 21 + test/CodeGen/X86/system-intrinsics-64-xsaves.ll | 41 + test/CodeGen/X86/system-intrinsics-64.ll | 2 +- test/CodeGen/X86/system-intrinsics-xsave.ll | 23 + test/CodeGen/X86/system-intrinsics-xsavec.ll | 12 + test/CodeGen/X86/system-intrinsics-xsaveopt.ll | 12 + test/CodeGen/X86/system-intrinsics-xsaves.ll | 23 + test/CodeGen/X86/system-intrinsics.ll | 2 +- test/CodeGen/X86/tail-dup-catchret.ll | 31 + test/CodeGen/X86/tail-merge-wineh.ll | 107 + test/CodeGen/X86/tail-opts.ll | 40 +- test/CodeGen/X86/tailcall-mem-intrinsics.ll | 4 +- test/CodeGen/X86/tailcall-msvc-conventions.ll | 189 + test/CodeGen/X86/tailcall-readnone.ll | 15 + test/CodeGen/X86/tls-android-negative.ll | 65 + test/CodeGen/X86/tls-android.ll | 89 + test/CodeGen/X86/tls-models.ll | 2 + test/CodeGen/X86/tls-pie.ll | 8 + test/CodeGen/X86/token_landingpad.ll | 21 + test/CodeGen/X86/trunc-store.ll | 49 + test/CodeGen/X86/unaligned-32-byte-memops.ll | 7 +- test/CodeGen/X86/unaligned-spill-folding.ll | 2 +- test/CodeGen/X86/unknown-location.ll | 8 +- test/CodeGen/X86/v2f32.ll | 1 + test/CodeGen/X86/vec_cast2.ll | 31 +- test/CodeGen/X86/vec_cmp_sint-128.ll | 722 + 
test/CodeGen/X86/vec_cmp_uint-128.ll | 860 + test/CodeGen/X86/vec_ctbits.ll | 129 +- test/CodeGen/X86/vec_extract-avx.ll | 114 +- test/CodeGen/X86/vec_fabs.ll | 2 +- test/CodeGen/X86/vec_fp_to_int.ll | 1269 +- test/CodeGen/X86/vec_insert-5.ll | 1 + test/CodeGen/X86/vec_int_to_fp.ll | 1920 ++- test/CodeGen/X86/vec_minmax_sint.ll | 2090 +++ test/CodeGen/X86/vec_minmax_uint.ll | 2229 +++ test/CodeGen/X86/vec_sdiv_to_shift.ll | 13 + test/CodeGen/X86/vec_trunc_sext.ll | 31 +- test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 130 + test/CodeGen/X86/vec_uint_to_fp.ll | 8 +- test/CodeGen/X86/vector-blend.ll | 72 +- test/CodeGen/X86/vector-idiv.ll | 1 + test/CodeGen/X86/vector-lzcnt-128.ll | 472 +- test/CodeGen/X86/vector-lzcnt-256.ll | 257 +- test/CodeGen/X86/vector-lzcnt-512.ll | 219 + .../CodeGen/X86/vector-merge-store-fp-constants.ll | 35 + test/CodeGen/X86/vector-popcnt-128.ll | 37 +- test/CodeGen/X86/vector-popcnt-256.ll | 73 +- test/CodeGen/X86/vector-popcnt-512.ll | 161 + test/CodeGen/X86/vector-rotate-128.ll | 1595 ++ test/CodeGen/X86/vector-rotate-256.ll | 1089 ++ test/CodeGen/X86/vector-sext.ll | 3988 ++++- test/CodeGen/X86/vector-shift-ashr-128.ll | 917 +- test/CodeGen/X86/vector-shift-ashr-256.ll | 691 +- test/CodeGen/X86/vector-shift-ashr-512.ll | 378 + test/CodeGen/X86/vector-shift-lshr-128.ll | 619 +- test/CodeGen/X86/vector-shift-lshr-256.ll | 444 +- test/CodeGen/X86/vector-shift-lshr-512.ll | 317 + test/CodeGen/X86/vector-shift-shl-128.ll | 501 +- test/CodeGen/X86/vector-shift-shl-256.ll | 403 +- test/CodeGen/X86/vector-shift-shl-512.ll | 293 + test/CodeGen/X86/vector-shuffle-128-v16.ll | 276 + test/CodeGen/X86/vector-shuffle-128-v2.ll | 318 +- test/CodeGen/X86/vector-shuffle-128-v4.ll | 92 + test/CodeGen/X86/vector-shuffle-128-v8.ll | 252 + test/CodeGen/X86/vector-shuffle-256-v16.ll | 249 +- test/CodeGen/X86/vector-shuffle-256-v32.ll | 210 +- test/CodeGen/X86/vector-shuffle-256-v4.ll | 703 +- test/CodeGen/X86/vector-shuffle-256-v8.ll | 221 +- 
test/CodeGen/X86/vector-shuffle-512-v16.ll | 134 + test/CodeGen/X86/vector-shuffle-512-v32.ll | 44 + test/CodeGen/X86/vector-shuffle-512-v8.ll | 2487 ++- test/CodeGen/X86/vector-shuffle-combining.ll | 1 + test/CodeGen/X86/vector-shuffle-mmx.ll | 1 + test/CodeGen/X86/vector-shuffle-sse1.ll | 1 + test/CodeGen/X86/vector-shuffle-sse4a.ll | 140 + test/CodeGen/X86/vector-shuffle-v1.ll | 439 + test/CodeGen/X86/vector-trunc.ll | 681 +- test/CodeGen/X86/vector-tzcnt-128.ll | 2035 +-- test/CodeGen/X86/vector-tzcnt-256.ll | 1455 +- test/CodeGen/X86/vector-tzcnt-512.ll | 271 + test/CodeGen/X86/vector-zext.ll | 1523 +- test/CodeGen/X86/vector-zmov.ll | 1 + ...gisters-cleared-in-machine-functions-liveins.ll | 19 + test/CodeGen/X86/vmovq.ll | 28 + test/CodeGen/X86/vselect-2.ll | 1 + test/CodeGen/X86/vselect-avx.ll | 12 +- test/CodeGen/X86/vselect-minmax.ll | 16332 +++++++++++++------ test/CodeGen/X86/vselect.ll | 1 + test/CodeGen/X86/vshift_scalar.ll | 1 + test/CodeGen/X86/wide-integer-cmp.ll | 130 + test/CodeGen/X86/widen_load-2.ll | 4 +- test/CodeGen/X86/widen_shuffle-1.ll | 1 + test/CodeGen/X86/win-catchpad-csrs.ll | 268 + test/CodeGen/X86/win-catchpad-nested-cxx.ll | 105 + test/CodeGen/X86/win-catchpad-nested.ll | 42 + test/CodeGen/X86/win-catchpad-varargs.ll | 101 + test/CodeGen/X86/win-catchpad.ll | 353 + test/CodeGen/X86/win-cleanuppad.ll | 199 + test/CodeGen/X86/win-funclet-cfi.ll | 95 + test/CodeGen/X86/win-mixed-ehpersonality.ll | 81 + test/CodeGen/X86/win32-eh-states.ll | 213 +- test/CodeGen/X86/win32-eh.ll | 49 +- test/CodeGen/X86/win32-pic-jumptable.ll | 8 +- test/CodeGen/X86/win32-seh-catchpad-realign.ll | 77 + test/CodeGen/X86/win32-seh-catchpad.ll | 231 + test/CodeGen/X86/win32-seh-nested-finally.ll | 80 + test/CodeGen/X86/win32-spill-xmm.ll | 40 + test/CodeGen/X86/win64_frame.ll | 70 +- test/CodeGen/X86/win64_sibcall.ll | 38 + test/CodeGen/X86/win_coreclr_chkstk.ll | 143 + test/CodeGen/X86/win_eh_prepare.ll | 82 - test/CodeGen/X86/win_ftol2.ll | 166 - 
test/CodeGen/X86/wineh-coreclr.ll | 267 + test/CodeGen/X86/wineh-exceptionpointer.ll | 26 + test/CodeGen/X86/wineh-no-ehpads.ll | 20 + test/CodeGen/X86/x32-function_pointer-3.ll | 2 +- test/CodeGen/X86/x32-indirectbr.ll | 26 + test/CodeGen/X86/x32-landingpad.ll | 27 + test/CodeGen/X86/x32-va_start.ll | 99 + test/CodeGen/X86/x86-32-intrcc.ll | 79 + test/CodeGen/X86/x86-64-baseptr.ll | 4 +- .../X86/x86-64-double-precision-shift-left.ll | 17 +- .../X86/x86-64-double-precision-shift-right.ll | 9 +- test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll | 4 +- test/CodeGen/X86/x86-64-intrcc.ll | 86 + test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 108 + test/CodeGen/X86/x86-64-pic-10.ll | 2 +- test/CodeGen/X86/x86-fold-pshufb.ll | 20 +- test/CodeGen/X86/x86-sanitizer-shrink-wrapping.ll | 40 + test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll | 16 +- test/CodeGen/X86/x86-shrink-wrap-unwind.ll | 153 + test/CodeGen/X86/x86-shrink-wrapping.ll | 254 +- test/CodeGen/X86/x86-win64-shrink-wrapping.ll | 126 + test/CodeGen/X86/xop-intrinsics-x86_64.ll | 33 +- test/CodeGen/X86/xop-pcmov.ll | 163 + test/CodeGen/XCore/aliases.ll | 6 +- test/CodeGen/XCore/dwarf_debug.ll | 8 +- 1591 files changed, 135533 insertions(+), 31280 deletions(-) create mode 100644 test/CodeGen/AArch64/aarch64-addv.ll create mode 100644 test/CodeGen/AArch64/aarch64-deferred-spilling.ll create mode 100644 test/CodeGen/AArch64/aarch64-loop-gep-opt.ll create mode 100644 test/CodeGen/AArch64/aarch64-minmaxv.ll create mode 100644 test/CodeGen/AArch64/aarch64-smax-constantfold.ll create mode 100644 test/CodeGen/AArch64/arm64-builtins-linux.ll create mode 100644 test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll create mode 100644 test/CodeGen/AArch64/arm64-fmax-safe.ll create mode 100644 test/CodeGen/AArch64/arm64-ld-from-st.ll create mode 100644 test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll create mode 100644 test/CodeGen/AArch64/bitreverse.ll create mode 100644 test/CodeGen/AArch64/cxx-tlscc.ll create mode 100644 
test/CodeGen/AArch64/dag-combine-select.ll create mode 100644 test/CodeGen/AArch64/divrem.ll create mode 100644 test/CodeGen/AArch64/emutls.ll create mode 100644 test/CodeGen/AArch64/emutls_generic.ll create mode 100644 test/CodeGen/AArch64/eon.ll create mode 100644 test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll create mode 100644 test/CodeGen/AArch64/fast-isel-cmp-vec.ll create mode 100644 test/CodeGen/AArch64/fast-isel-folded-shift.ll create mode 100644 test/CodeGen/AArch64/fcvt_combine.ll create mode 100644 test/CodeGen/AArch64/fdiv_combine.ll create mode 100644 test/CodeGen/AArch64/misched-fusion.ll create mode 100644 test/CodeGen/AArch64/nontemporal.ll create mode 100644 test/CodeGen/AArch64/readcyclecounter.ll create mode 100644 test/CodeGen/AArch64/rotate.ll create mode 100644 test/CodeGen/AArch64/round-conv.ll create mode 100755 test/CodeGen/AArch64/shrink-wrap.ll create mode 100644 test/CodeGen/AArch64/stackmap-frame-setup.ll create mode 100644 test/CodeGen/AArch64/tbi.ll create mode 100644 test/CodeGen/AArch64/vector-fcopysign.ll create mode 100644 test/CodeGen/AMDGPU/addrspacecast.ll create mode 100644 test/CodeGen/AMDGPU/annotate-kernel-features.ll create mode 100644 test/CodeGen/AMDGPU/bitreverse.ll create mode 100644 test/CodeGen/AMDGPU/calling-conventions.ll create mode 100644 test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll create mode 100644 test/CodeGen/AMDGPU/ci-use-flat-for-global.ll create mode 100644 test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll create mode 100644 test/CodeGen/AMDGPU/ds-sub-offset.ll create mode 100644 test/CodeGen/AMDGPU/dynamic_stackalloc.ll create mode 100644 test/CodeGen/AMDGPU/extract-vector-elt-i64.ll create mode 100644 test/CodeGen/AMDGPU/flat-scratch-reg.ll create mode 100644 test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll create mode 100644 test/CodeGen/AMDGPU/global-constant.ll create mode 100644 test/CodeGen/AMDGPU/hsa-globals.ll create mode 100644 test/CodeGen/AMDGPU/hsa-group-segment.ll create mode 100644 
test/CodeGen/AMDGPU/image-attributes.ll create mode 100644 test/CodeGen/AMDGPU/image-resource-id.ll create mode 100644 test/CodeGen/AMDGPU/inline-constraints.ll create mode 100644 test/CodeGen/AMDGPU/large-alloca-compute.ll create mode 100644 test/CodeGen/AMDGPU/large-alloca-graphics.ll delete mode 100644 test/CodeGen/AMDGPU/large-alloca.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll create mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll create mode 100644 test/CodeGen/AMDGPU/llvm.SI.packf16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll create mode 100644 test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll create mode 100644 test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll create mode 100644 test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll create mode 100644 test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll create mode 100644 test/CodeGen/AMDGPU/opencl-image-metadata.ll create mode 100644 test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll create mode 100644 test/CodeGen/AMDGPU/sampler-resource-id.ll create mode 100644 test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll create mode 100644 test/CodeGen/AMDGPU/si-literal-folding.ll create mode 100644 test/CodeGen/AMDGPU/sminmax.ll create mode 100644 test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll create mode 100644 
test/CodeGen/AMDGPU/store_typed.ll create mode 100644 test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll create mode 100644 test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll create mode 100644 test/CodeGen/ARM/MachO-subtypes.ll create mode 100644 test/CodeGen/ARM/Windows/division.ll delete mode 100644 test/CodeGen/ARM/Windows/integer-floating-point-conversion.ll create mode 100644 test/CodeGen/ARM/Windows/libcalls.ll create mode 100644 test/CodeGen/ARM/Windows/no-eabi.ll create mode 100644 test/CodeGen/ARM/Windows/no-frame-register.ll create mode 100644 test/CodeGen/ARM/Windows/overflow.ll create mode 100644 test/CodeGen/ARM/align-sp-adjustment.ll create mode 100644 test/CodeGen/ARM/apcs-vfp.ll create mode 100644 test/CodeGen/ARM/arm-eabi.ll create mode 100644 test/CodeGen/ARM/arm-shrink-wrapping-linux.ll create mode 100644 test/CodeGen/ARM/arm-shrink-wrapping.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-minsize.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-mixed.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-optnone.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-optsize.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization.ll create mode 100644 test/CodeGen/ARM/cfi-alignment.ll create mode 100644 test/CodeGen/ARM/combine-vmovdrr.ll create mode 100644 test/CodeGen/ARM/debugtrap.ll create mode 100644 test/CodeGen/ARM/emutls.ll create mode 100644 test/CodeGen/ARM/emutls1.ll create mode 100644 test/CodeGen/ARM/emutls_generic.ll create mode 100644 test/CodeGen/ARM/fp16-args.ll create mode 100644 test/CodeGen/ARM/gep-optimization.ll create mode 100644 test/CodeGen/ARM/global-merge-external.ll create mode 100644 test/CodeGen/ARM/ldm-stm-base-materialization.ll create mode 100644 test/CodeGen/ARM/legalize-unaligned-load.ll create mode 100644 test/CodeGen/ARM/memcpy-ldm-stm.ll create mode 100644 test/CodeGen/ARM/minmax.ll create mode 100644 
test/CodeGen/ARM/neon_vshl_minint.ll create mode 100644 test/CodeGen/ARM/pr25317.ll create mode 100644 test/CodeGen/ARM/pr25838.ll create mode 100644 test/CodeGen/ARM/rotate.ll create mode 100644 test/CodeGen/ARM/sat-arith.ll delete mode 100644 test/CodeGen/ARM/sched-it-debug-nodes.ll create mode 100644 test/CodeGen/ARM/setjmp_longjmp.ll create mode 100644 test/CodeGen/ARM/softfp-fabs-fneg.ll create mode 100644 test/CodeGen/ARM/ssat-lower.ll create mode 100644 test/CodeGen/ARM/ssat-upper.ll create mode 100644 test/CodeGen/ARM/subtarget-no-movt.ll create mode 100644 test/CodeGen/ARM/thumb1-ldst-opt.ll create mode 100644 test/CodeGen/ARM/unaligned_load_store_vfp.ll create mode 100644 test/CodeGen/ARM/usat-lower.ll create mode 100644 test/CodeGen/ARM/usat-upper.ll create mode 100644 test/CodeGen/ARM/v7k-abi-align.ll create mode 100644 test/CodeGen/ARM/v7k-libcalls.ll create mode 100644 test/CodeGen/ARM/v7k-sincos.ll create mode 100644 test/CodeGen/ARM/vfp-reg-stride.ll create mode 100644 test/CodeGen/ARM/vld-vst-upgrade.ll create mode 100644 test/CodeGen/ARM/vminmaxnm-safe.ll create mode 100644 test/CodeGen/CPP/gep.ll create mode 100644 test/CodeGen/Generic/ForceStackAlign.ll create mode 100644 test/CodeGen/Generic/lit.local.cfg create mode 100644 test/CodeGen/Hexagon/NVJumpCmp.ll create mode 100644 test/CodeGen/Hexagon/bit-eval.ll create mode 100644 test/CodeGen/Hexagon/bit-loop.ll create mode 100644 test/CodeGen/Hexagon/cfi-late.ll create mode 100644 test/CodeGen/Hexagon/early-if-conversion-bug1.ll create mode 100644 test/CodeGen/Hexagon/early-if-phi-i1.ll create mode 100644 test/CodeGen/Hexagon/early-if-spare.ll create mode 100644 test/CodeGen/Hexagon/early-if.ll create mode 100644 test/CodeGen/Hexagon/ifcvt-edge-weight.ll create mode 100644 test/CodeGen/Hexagon/memcpy-likely-aligned.ll create mode 100644 test/CodeGen/Hexagon/mux-basic.ll create mode 100644 test/CodeGen/Hexagon/pic-jumptables.ll create mode 100644 test/CodeGen/Hexagon/pic-simple.ll create mode 
100644 test/CodeGen/Hexagon/pic-static.ll create mode 100644 test/CodeGen/Hexagon/sdr-basic.ll create mode 100644 test/CodeGen/Hexagon/sdr-shr32.ll create mode 100644 test/CodeGen/Hexagon/store-widen-aliased-load.ll create mode 100644 test/CodeGen/Hexagon/store-widen-negv.ll create mode 100644 test/CodeGen/Hexagon/store-widen-negv2.ll create mode 100644 test/CodeGen/Hexagon/store-widen.ll create mode 100644 test/CodeGen/Hexagon/tail-dup-subreg-abort.ll create mode 100644 test/CodeGen/Hexagon/v60Intrins.ll create mode 100644 test/CodeGen/Hexagon/v60Vasr.ll create mode 100644 test/CodeGen/Hexagon/v60small.ll create mode 100644 test/CodeGen/MIR/AArch64/cfi-def-cfa.mir create mode 100644 test/CodeGen/MIR/AArch64/expected-target-flag-name.mir create mode 100644 test/CodeGen/MIR/AArch64/invalid-target-flag-name.mir create mode 100644 test/CodeGen/MIR/AArch64/lit.local.cfg create mode 100644 test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir create mode 100644 test/CodeGen/MIR/AArch64/stack-object-local-offset.mir create mode 100644 test/CodeGen/MIR/AArch64/target-flags.mir create mode 100644 test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir create mode 100644 test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir create mode 100644 test/CodeGen/MIR/AMDGPU/lit.local.cfg create mode 100644 test/CodeGen/MIR/AMDGPU/target-index-operands.mir create mode 100644 test/CodeGen/MIR/ARM/ARMLoadStoreDBG.mir create mode 100644 test/CodeGen/MIR/ARM/bundled-instructions.mir create mode 100644 test/CodeGen/MIR/ARM/cfi-same-value.mir create mode 100644 test/CodeGen/MIR/ARM/expected-closing-brace.mir create mode 100644 test/CodeGen/MIR/ARM/extraneous-closing-brace-error.mir create mode 100644 test/CodeGen/MIR/ARM/lit.local.cfg create mode 100644 test/CodeGen/MIR/ARM/nested-instruction-bundle-error.mir create mode 100644 test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir create mode 100644 test/CodeGen/MIR/Generic/basic-blocks.mir create mode 100644 
test/CodeGen/MIR/Generic/expected-colon-after-basic-block.mir create mode 100644 test/CodeGen/MIR/Generic/expected-mbb-reference-for-successor-mbb.mir create mode 100644 test/CodeGen/MIR/Generic/frame-info.mir create mode 100644 test/CodeGen/MIR/Generic/function-missing-machine-function.mir create mode 100644 test/CodeGen/MIR/Generic/invalid-jump-table-kind.mir create mode 100644 test/CodeGen/MIR/Generic/lit.local.cfg create mode 100644 test/CodeGen/MIR/Generic/llvm-ir-error-reported.mir create mode 100644 test/CodeGen/MIR/Generic/llvmIR.mir create mode 100644 test/CodeGen/MIR/Generic/llvmIRMissing.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-redefinition-error.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-undefined-ir-block.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-unknown-name.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-function.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-name.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-redefinition-error.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function.mir create mode 100644 test/CodeGen/MIR/Generic/register-info.mir create mode 100644 test/CodeGen/MIR/Mips/expected-global-value-or-symbol-after-call-entry.mir create mode 100644 test/CodeGen/MIR/Mips/lit.local.cfg create mode 100644 test/CodeGen/MIR/Mips/memory-operands.mir create mode 100644 test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir create mode 100644 test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir create mode 100644 test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir create mode 100644 test/CodeGen/MIR/NVPTX/lit.local.cfg create mode 100644 test/CodeGen/MIR/PowerPC/lit.local.cfg create mode 100644 
test/CodeGen/MIR/PowerPC/unordered-implicit-registers.mir create mode 100644 test/CodeGen/MIR/X86/basic-block-not-at-start-of-line-error.mir create mode 100644 test/CodeGen/MIR/X86/block-address-operands.mir create mode 100644 test/CodeGen/MIR/X86/callee-saved-info.mir create mode 100644 test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir create mode 100644 test/CodeGen/MIR/X86/cfi-def-cfa-register.mir create mode 100644 test/CodeGen/MIR/X86/cfi-offset.mir create mode 100644 test/CodeGen/MIR/X86/constant-pool-item-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/constant-pool.mir create mode 100644 test/CodeGen/MIR/X86/constant-value-error.mir create mode 100644 test/CodeGen/MIR/X86/def-register-already-tied-error.mir create mode 100644 test/CodeGen/MIR/X86/duplicate-memory-operand-flag.mir create mode 100644 test/CodeGen/MIR/X86/duplicate-register-flag-error.mir create mode 100644 test/CodeGen/MIR/X86/early-clobber-register-flag.mir create mode 100644 test/CodeGen/MIR/X86/expected-align-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-alignment-after-align-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-basic-block-at-start-of-body.mir create mode 100644 test/CodeGen/MIR/X86/expected-block-reference-in-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/expected-comma-after-cfi-register.mir create mode 100644 test/CodeGen/MIR/X86/expected-comma-after-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-from-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-function-reference-after-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/expected-global-value-after-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/expected-integer-after-offset-sign.mir create mode 100644 test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir create mode 100644 test/CodeGen/MIR/X86/expected-integer-in-successor-weight.mir create mode 100644 
test/CodeGen/MIR/X86/expected-load-or-store-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-metadata-node-after-debug-location.mir create mode 100644 test/CodeGen/MIR/X86/expected-metadata-node-after-exclaim.mir create mode 100644 test/CodeGen/MIR/X86/expected-metadata-node-in-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir create mode 100644 test/CodeGen/MIR/X86/expected-named-register-in-callee-saved-register.mir create mode 100644 test/CodeGen/MIR/X86/expected-named-register-in-functions-livein.mir create mode 100644 test/CodeGen/MIR/X86/expected-newline-at-end-of-list.mir create mode 100644 test/CodeGen/MIR/X86/expected-offset-after-cfi-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-pointer-value-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-positive-alignment-after-align.mir create mode 100644 test/CodeGen/MIR/X86/expected-register-after-cfi-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir create mode 100644 test/CodeGen/MIR/X86/expected-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/expected-target-flag-name.mir create mode 100644 test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir create mode 100644 test/CodeGen/MIR/X86/expected-value-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-virtual-register-in-functions-livein.mir create mode 100644 test/CodeGen/MIR/X86/external-symbol-operands.mir create mode 100644 test/CodeGen/MIR/X86/fixed-stack-memory-operands.mir create mode 100644 test/CodeGen/MIR/X86/fixed-stack-object-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/frame-info-save-restore-points.mir create mode 100644 test/CodeGen/MIR/X86/frame-info-stack-references.mir create mode 100644 test/CodeGen/MIR/X86/frame-setup-instruction-flag.mir create mode 100644 test/CodeGen/MIR/X86/function-liveins.mir create mode 100644 
test/CodeGen/MIR/X86/inline-asm-registers.mir create mode 100644 test/CodeGen/MIR/X86/instructions-debug-location.mir create mode 100644 test/CodeGen/MIR/X86/invalid-constant-pool-item.mir create mode 100644 test/CodeGen/MIR/X86/invalid-metadata-node-type.mir create mode 100644 test/CodeGen/MIR/X86/invalid-target-flag-name.mir create mode 100644 test/CodeGen/MIR/X86/invalid-tied-def-index-error.mir create mode 100644 test/CodeGen/MIR/X86/jump-table-info.mir create mode 100644 test/CodeGen/MIR/X86/jump-table-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/large-cfi-offset-number-error.mir create mode 100644 test/CodeGen/MIR/X86/large-immediate-operand-error.mir create mode 100644 test/CodeGen/MIR/X86/large-offset-number-error.mir create mode 100644 test/CodeGen/MIR/X86/large-size-in-memory-operand-error.mir create mode 100644 test/CodeGen/MIR/X86/liveout-register-mask.mir create mode 100644 test/CodeGen/MIR/X86/machine-verifier.mir create mode 100644 test/CodeGen/MIR/X86/memory-operands.mir create mode 100644 test/CodeGen/MIR/X86/metadata-operands.mir create mode 100644 test/CodeGen/MIR/X86/missing-closing-quote.mir delete mode 100644 test/CodeGen/MIR/X86/missing-instruction.mir create mode 100644 test/CodeGen/MIR/X86/newline-handling.mir create mode 100644 test/CodeGen/MIR/X86/register-operands-target-flag-error.mir create mode 100644 test/CodeGen/MIR/X86/simple-register-allocation-hints.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-debug-info.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-invalid-name.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-operand-name-mismatch-error.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-operands.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/standalone-register-error.mir create mode 100644 test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir create mode 100644 
test/CodeGen/MIR/X86/successor-basic-blocks.mir create mode 100644 test/CodeGen/MIR/X86/tied-def-operand-invalid.mir create mode 100644 test/CodeGen/MIR/X86/undefined-fixed-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/undefined-ir-block-in-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/undefined-ir-block-slot-in-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/undefined-jump-table-id.mir create mode 100644 test/CodeGen/MIR/X86/undefined-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/undefined-value-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/unknown-metadata-keyword.mir create mode 100644 test/CodeGen/MIR/X86/unknown-metadata-node.mir create mode 100644 test/CodeGen/MIR/X86/used-physical-register-info.mir create mode 100644 test/CodeGen/MIR/X86/virtual-register-redefinition-error.mir delete mode 100644 test/CodeGen/MIR/basic-blocks.mir delete mode 100644 test/CodeGen/MIR/expected-eof-after-successor-mbb.mir delete mode 100644 test/CodeGen/MIR/expected-mbb-reference-for-successor-mbb.mir delete mode 100644 test/CodeGen/MIR/frame-info.mir delete mode 100644 test/CodeGen/MIR/function-missing-machine-function.mir delete mode 100644 test/CodeGen/MIR/llvm-ir-error-reported.mir delete mode 100644 test/CodeGen/MIR/llvmIR.mir delete mode 100644 test/CodeGen/MIR/llvmIRMissing.mir delete mode 100644 test/CodeGen/MIR/machine-basic-block-redefinition-error.mir delete mode 100644 test/CodeGen/MIR/machine-basic-block-unknown-name.mir delete mode 100644 test/CodeGen/MIR/machine-function-missing-body-error.mir delete mode 100644 test/CodeGen/MIR/machine-function-missing-function.mir delete mode 100644 test/CodeGen/MIR/machine-function-missing-name.mir delete mode 100644 test/CodeGen/MIR/machine-function-redefinition-error.mir delete mode 100644 test/CodeGen/MIR/machine-function.mir delete mode 100644 test/CodeGen/MIR/register-info.mir delete mode 100644 test/CodeGen/MIR/successor-basic-blocks.mir create mode 100644 
test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll create mode 100644 test/CodeGen/Mips/emutls_generic.ll create mode 100644 test/CodeGen/Mips/interrupt-attr-64-error.ll create mode 100644 test/CodeGen/Mips/interrupt-attr-args-error.ll create mode 100644 test/CodeGen/Mips/interrupt-attr-error.ll create mode 100644 test/CodeGen/Mips/interrupt-attr.ll create mode 100644 test/CodeGen/Mips/llvm-ir/atomicrmx.ll create mode 100644 test/CodeGen/Mips/llvm-ir/load-atomic.ll create mode 100644 test/CodeGen/Mips/llvm-ir/sqrt.ll create mode 100644 test/CodeGen/Mips/llvm-ir/store-atomic.ll create mode 100644 test/CodeGen/NVPTX/branch-fold.ll create mode 100644 test/CodeGen/NVPTX/bypass-div.ll create mode 100644 test/CodeGen/NVPTX/combine-min-max.ll create mode 100644 test/CodeGen/NVPTX/global-addrspace.ll create mode 100644 test/CodeGen/NVPTX/load-with-non-coherent-cache.ll create mode 100644 test/CodeGen/NVPTX/reg-copy.ll create mode 100644 test/CodeGen/PowerPC/BoolRetToIntTest.ll create mode 100644 test/CodeGen/PowerPC/BreakableToken-reduced.ll create mode 100644 test/CodeGen/PowerPC/aantidep-def-ec.mir create mode 100644 test/CodeGen/PowerPC/aantidep-inline-asm-use.ll create mode 100644 test/CodeGen/PowerPC/addisdtprelha-nonr3.mir create mode 100644 test/CodeGen/PowerPC/bitcasts-direct-move.ll create mode 100644 test/CodeGen/PowerPC/bitreverse.ll create mode 100644 test/CodeGen/PowerPC/branch-hint.ll create mode 100644 test/CodeGen/PowerPC/coal-sections.ll create mode 100644 test/CodeGen/PowerPC/crbit-asm-disabled.ll create mode 100644 test/CodeGen/PowerPC/dyn-alloca-offset.ll create mode 100644 test/CodeGen/PowerPC/e500-1.ll create mode 100644 test/CodeGen/PowerPC/emutls_generic.ll create mode 100644 test/CodeGen/PowerPC/fma-mutate-register-constraint.ll create mode 100644 test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll create mode 100644 test/CodeGen/PowerPC/machine-combiner.ll create mode 100644 test/CodeGen/PowerPC/mc-instrlat.ll create mode 100644 
test/CodeGen/PowerPC/mcm-13.ll create mode 100644 test/CodeGen/PowerPC/merge-st-chain-op.ll create mode 100644 test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll create mode 100644 test/CodeGen/PowerPC/peephole-align.ll create mode 100644 test/CodeGen/PowerPC/ppc-shrink-wrapping.ll create mode 100644 test/CodeGen/PowerPC/ppcsoftops.ll create mode 100644 test/CodeGen/PowerPC/pr24636.ll create mode 100644 test/CodeGen/PowerPC/pr25157-peephole.ll create mode 100644 test/CodeGen/PowerPC/preincprep-nontrans-crash.ll create mode 100644 test/CodeGen/PowerPC/qpx-unal-cons-lds.ll create mode 100644 test/CodeGen/PowerPC/rotl-rotr-crash.ll create mode 100644 test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll create mode 100644 test/CodeGen/PowerPC/stackmap-frame-setup.ll create mode 100644 test/CodeGen/PowerPC/swaps-le-6.ll create mode 100644 test/CodeGen/PowerPC/unal-vec-ldst.ll create mode 100644 test/CodeGen/PowerPC/unal-vec-negarith.ll create mode 100644 test/CodeGen/PowerPC/variable_elem_vec_extracts.ll create mode 100644 test/CodeGen/PowerPC/vec-asm-disabled.ll create mode 100644 test/CodeGen/PowerPC/vector-merge-store-fp-constants.ll create mode 100644 test/CodeGen/SPARC/32abi.ll create mode 100644 test/CodeGen/SPARC/float-constants.ll create mode 100644 test/CodeGen/SPARC/missing-sret.ll create mode 100644 test/CodeGen/SPARC/reserved-regs.ll create mode 100644 test/CodeGen/SPARC/select-mask.ll create mode 100644 test/CodeGen/SPARC/spill.ll create mode 100644 test/CodeGen/SPARC/stack-align.ll create mode 100644 test/CodeGen/SystemZ/alloca-03.ll create mode 100644 test/CodeGen/SystemZ/alloca-04.ll create mode 100644 test/CodeGen/SystemZ/dag-combine-01.ll create mode 100644 test/CodeGen/SystemZ/fp-cmp-05.ll create mode 100644 test/CodeGen/SystemZ/fp-libcall.ll create mode 100644 test/CodeGen/SystemZ/fp-sincos-01.ll create mode 100644 test/CodeGen/SystemZ/int-cmp-51.ll create mode 100644 test/CodeGen/SystemZ/int-cmp-52.ll create mode 100644 
test/CodeGen/SystemZ/vec-perm-12.ll create mode 100644 test/CodeGen/SystemZ/vec-perm-13.ll create mode 100644 test/CodeGen/Thumb/ldm-stm-base-materialization-thumb2.ll delete mode 100644 test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll create mode 100644 test/CodeGen/Thumb/thumb-shrink-wrapping.ll create mode 100644 test/CodeGen/Thumb2/emit-unwinding.ll create mode 100644 test/CodeGen/Thumb2/setjmp_longjmp.ll create mode 100644 test/CodeGen/WebAssembly/call.ll create mode 100644 test/CodeGen/WebAssembly/cfg-stackify.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_f32.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_f64.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_i32.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_i64.ll create mode 100644 test/CodeGen/WebAssembly/conv.ll create mode 100644 test/CodeGen/WebAssembly/copysign-casts.ll create mode 100644 test/CodeGen/WebAssembly/cpus.ll create mode 100644 test/CodeGen/WebAssembly/dead-vreg.ll create mode 100644 test/CodeGen/WebAssembly/f32.ll create mode 100644 test/CodeGen/WebAssembly/f64.ll create mode 100644 test/CodeGen/WebAssembly/fast-isel.ll create mode 100644 test/CodeGen/WebAssembly/frem.ll create mode 100644 test/CodeGen/WebAssembly/func.ll create mode 100644 test/CodeGen/WebAssembly/global.ll create mode 100644 test/CodeGen/WebAssembly/globl.ll create mode 100644 test/CodeGen/WebAssembly/i32.ll create mode 100644 test/CodeGen/WebAssembly/i64.ll create mode 100644 test/CodeGen/WebAssembly/ident.ll create mode 100644 test/CodeGen/WebAssembly/immediates.ll create mode 100644 test/CodeGen/WebAssembly/inline-asm.ll create mode 100644 test/CodeGen/WebAssembly/legalize.ll create mode 100644 test/CodeGen/WebAssembly/load-ext.ll create mode 100644 test/CodeGen/WebAssembly/load-store-i1.ll create mode 100644 test/CodeGen/WebAssembly/load.ll create mode 100644 test/CodeGen/WebAssembly/loop-idiom.ll create mode 100644 test/CodeGen/WebAssembly/memory-addr32.ll create mode 100644 
test/CodeGen/WebAssembly/memory-addr64.ll create mode 100644 test/CodeGen/WebAssembly/offset-folding.ll create mode 100644 test/CodeGen/WebAssembly/offset.ll create mode 100644 test/CodeGen/WebAssembly/phi.ll create mode 100644 test/CodeGen/WebAssembly/reg-stackify.ll create mode 100644 test/CodeGen/WebAssembly/return-int32.ll create mode 100644 test/CodeGen/WebAssembly/return-void.ll create mode 100644 test/CodeGen/WebAssembly/returned.ll create mode 100644 test/CodeGen/WebAssembly/select.ll create mode 100644 test/CodeGen/WebAssembly/signext-zeroext.ll create mode 100644 test/CodeGen/WebAssembly/store-results.ll create mode 100644 test/CodeGen/WebAssembly/store-trunc.ll create mode 100644 test/CodeGen/WebAssembly/store.ll create mode 100644 test/CodeGen/WebAssembly/switch.ll create mode 100644 test/CodeGen/WebAssembly/unreachable.ll create mode 100644 test/CodeGen/WebAssembly/unused-argument.ll create mode 100644 test/CodeGen/WebAssembly/userstack.ll create mode 100644 test/CodeGen/WebAssembly/varargs.ll create mode 100644 test/CodeGen/WebAssembly/vtable.ll delete mode 100644 test/CodeGen/WinEH/cppeh-alloca-sink.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-all-win32.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-all.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-and-throw.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-scalar.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-unwind.ll delete mode 100644 test/CodeGen/WinEH/cppeh-cleanup-invoke.ll delete mode 100644 test/CodeGen/WinEH/cppeh-demote-liveout.ll delete mode 100644 test/CodeGen/WinEH/cppeh-frame-vars.ll delete mode 100644 test/CodeGen/WinEH/cppeh-inalloca.ll delete mode 100644 test/CodeGen/WinEH/cppeh-min-unwind.ll delete mode 100644 test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll delete mode 100644 test/CodeGen/WinEH/cppeh-multi-catch.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nested-1.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nested-2.ll delete mode 100644 
test/CodeGen/WinEH/cppeh-nested-3.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nested-rethrow.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-catch-all.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-catch.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-cleanups.ll delete mode 100644 test/CodeGen/WinEH/cppeh-shared-empty-catch.ll delete mode 100644 test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll delete mode 100644 test/CodeGen/WinEH/cppeh-state-calc-1.ll delete mode 100644 test/CodeGen/WinEH/seh-catch-all.ll delete mode 100644 test/CodeGen/WinEH/seh-exception-code.ll delete mode 100644 test/CodeGen/WinEH/seh-exception-code2.ll delete mode 100644 test/CodeGen/WinEH/seh-inlined-finally.ll delete mode 100644 test/CodeGen/WinEH/seh-outlined-finally-win32.ll delete mode 100644 test/CodeGen/WinEH/seh-outlined-finally.ll delete mode 100644 test/CodeGen/WinEH/seh-prepared-basic.ll delete mode 100644 test/CodeGen/WinEH/seh-resume-phi.ll delete mode 100644 test/CodeGen/WinEH/seh-simple.ll create mode 100644 test/CodeGen/WinEH/wineh-cloning.ll create mode 100644 test/CodeGen/WinEH/wineh-demotion.ll create mode 100644 test/CodeGen/WinEH/wineh-intrinsics-invalid.ll create mode 100644 test/CodeGen/WinEH/wineh-intrinsics.ll create mode 100644 test/CodeGen/WinEH/wineh-no-demotion.ll create mode 100644 test/CodeGen/WinEH/wineh-statenumbering-cleanups.ll create mode 100644 test/CodeGen/WinEH/wineh-statenumbering.ll create mode 100644 test/CodeGen/X86/add-nsw-sext.ll create mode 100644 test/CodeGen/X86/and-encoding.ll create mode 100644 test/CodeGen/X86/atomic-flags.ll create mode 100644 test/CodeGen/X86/atomic-non-integer.ll create mode 100644 test/CodeGen/X86/avg.ll create mode 100644 test/CodeGen/X86/avx-isa-check.ll create mode 100644 test/CodeGen/X86/avx512-bugfix-25270.ll create mode 100644 
test/CodeGen/X86/avx512-ext.ll create mode 100644 test/CodeGen/X86/avx512-extract-subvector.ll create mode 100644 test/CodeGen/X86/avx512-skx-insert-subvec.ll delete mode 100644 test/CodeGen/X86/avx512-trunc-ext.ll create mode 100644 test/CodeGen/X86/avx512-trunc.ll create mode 100644 test/CodeGen/X86/avx512cd-intrinsics.ll create mode 100644 test/CodeGen/X86/avx512cdvl-intrinsics.ll create mode 100644 test/CodeGen/X86/avx512dq-intrinsics.ll create mode 100644 test/CodeGen/X86/bit-piece-comment.ll create mode 100644 test/CodeGen/X86/bitreverse.ll create mode 100644 test/CodeGen/X86/branchfolding-catchpads.ll create mode 100644 test/CodeGen/X86/catchpad-realign-savexmm.ll create mode 100644 test/CodeGen/X86/catchpad-regmask.ll create mode 100644 test/CodeGen/X86/catchpad-weight.ll create mode 100644 test/CodeGen/X86/catchret-empty-fallthrough.ll create mode 100644 test/CodeGen/X86/catchret-fallthrough.ll create mode 100644 test/CodeGen/X86/cleanuppad-inalloca.ll create mode 100644 test/CodeGen/X86/cleanuppad-large-codemodel.ll create mode 100644 test/CodeGen/X86/cleanuppad-realign.ll create mode 100644 test/CodeGen/X86/coal-sections.ll create mode 100644 test/CodeGen/X86/coalescer-win64.ll create mode 100644 test/CodeGen/X86/code_placement_cold_loop_blocks.ll create mode 100644 test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll create mode 100644 test/CodeGen/X86/code_placement_loop_rotation.ll create mode 100644 test/CodeGen/X86/code_placement_loop_rotation2.ll create mode 100644 test/CodeGen/X86/combine-multiplies.ll delete mode 100644 test/CodeGen/X86/combine-sse2-intrinsics.ll create mode 100644 test/CodeGen/X86/constant-hoisting-and.ll create mode 100644 test/CodeGen/X86/constant-hoisting-cmp.ll delete mode 100644 test/CodeGen/X86/cppeh-nounwind.ll create mode 100644 test/CodeGen/X86/cxx_tlscc64.ll create mode 100644 test/CodeGen/X86/dag-fmf-cse.ll create mode 100644 test/CodeGen/X86/dag-merge-fast-accesses.ll create mode 100644 
test/CodeGen/X86/darwin-tls.ll create mode 100644 test/CodeGen/X86/debugloc-argsize.ll create mode 100644 test/CodeGen/X86/eh-null-personality.ll create mode 100644 test/CodeGen/X86/emutls-pic.ll create mode 100644 test/CodeGen/X86/emutls-pie.ll create mode 100644 test/CodeGen/X86/emutls.ll create mode 100644 test/CodeGen/X86/emutls_generic.ll create mode 100644 test/CodeGen/X86/expand-vr64-gr64-copy.mir create mode 100644 test/CodeGen/X86/extractelement-legalization-cycle.ll create mode 100644 test/CodeGen/X86/fadd-combines.ll create mode 100644 test/CodeGen/X86/fast-isel-bitcasts-avx.ll create mode 100644 test/CodeGen/X86/fast-isel-bitcasts.ll create mode 100644 test/CodeGen/X86/fast-isel-deadcode.ll create mode 100644 test/CodeGen/X86/fast-isel-emutls.ll create mode 100644 test/CodeGen/X86/fast-isel-nontemporal.ll create mode 100644 test/CodeGen/X86/fast-isel-stackcheck.ll create mode 100644 test/CodeGen/X86/fixup-lea.ll create mode 100644 test/CodeGen/X86/fma-commute-x86.ll create mode 100644 test/CodeGen/X86/fma-scalar-memfold.ll create mode 100644 test/CodeGen/X86/fold-push.ll create mode 100644 test/CodeGen/X86/fp-logic.ll create mode 100644 test/CodeGen/X86/fp128-calling-conv.ll create mode 100644 test/CodeGen/X86/fp128-cast.ll create mode 100644 test/CodeGen/X86/fp128-compare.ll create mode 100644 test/CodeGen/X86/fp128-i128.ll create mode 100644 test/CodeGen/X86/fp128-libcalls.ll create mode 100644 test/CodeGen/X86/fp128-load.ll create mode 100644 test/CodeGen/X86/fp128-store.ll create mode 100644 test/CodeGen/X86/fpcmp-soft-fp.ll delete mode 100644 test/CodeGen/X86/frameescape.ll create mode 100644 test/CodeGen/X86/frem-msvc32.ll create mode 100644 test/CodeGen/X86/funclet-layout.ll create mode 100644 test/CodeGen/X86/function-alias.ll create mode 100644 test/CodeGen/X86/hhvm-cc.ll create mode 100644 test/CodeGen/X86/i386-shrink-wrapping.ll create mode 100644 test/CodeGen/X86/immediate_merging.ll create mode 100644 
test/CodeGen/X86/inconsistent_landingpad.ll create mode 100644 test/CodeGen/X86/inline-sse.ll create mode 100644 test/CodeGen/X86/insertps-from-constantpool.ll create mode 100644 test/CodeGen/X86/insertps-unfold-load-bug.ll create mode 100644 test/CodeGen/X86/late-address-taken.ll create mode 100644 test/CodeGen/X86/lea-opt.ll create mode 100644 test/CodeGen/X86/localescape.ll create mode 100644 test/CodeGen/X86/machine-combiner-int-vec.ll create mode 100644 test/CodeGen/X86/machine-combiner-int.ll create mode 100644 test/CodeGen/X86/materialize.ll create mode 100644 test/CodeGen/X86/mcu-abi.ll create mode 100644 test/CodeGen/X86/merge-store-partially-alias-loads.ll create mode 100644 test/CodeGen/X86/mmx-coalescing.ll create mode 100644 test/CodeGen/X86/mmx-only.ll create mode 100644 test/CodeGen/X86/movpc32-check.ll create mode 100644 test/CodeGen/X86/or-lea.ll create mode 100644 test/CodeGen/X86/patchpoint-verifiable.mir create mode 100644 test/CodeGen/X86/peephole-na-phys-copy-folding.ll create mode 100644 test/CodeGen/X86/pop-stack-cleanup.ll delete mode 100644 test/CodeGen/X86/pr21529.ll delete mode 100644 test/CodeGen/X86/pr23900.ll create mode 100644 test/CodeGen/X86/pr24139.ll create mode 100644 test/CodeGen/X86/pr24602.ll create mode 100644 test/CodeGen/X86/pr25828.ll create mode 100644 test/CodeGen/X86/prolog-push-seq.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower1.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower2.ll create mode 100644 test/CodeGen/X86/push-cfi-debug.ll create mode 100644 test/CodeGen/X86/push-cfi-obj.ll create mode 100644 test/CodeGen/X86/push-cfi.ll create mode 100644 test/CodeGen/X86/rem_crash.ll delete mode 100644 test/CodeGen/X86/remat-invalid-liveness.ll create mode 100644 test/CodeGen/X86/safestack.ll create mode 100644 test/CodeGen/X86/sar_fold.ll create mode 100644 test/CodeGen/X86/sar_fold64.ll create mode 100644 test/CodeGen/X86/scalar-fp-to-i64.ll 
create mode 100644 test/CodeGen/X86/scalar-int-to-fp.ll create mode 100644 test/CodeGen/X86/sdiv-pow2.ll create mode 100644 test/CodeGen/X86/seh-catchpad.ll create mode 100644 test/CodeGen/X86/seh-exception-code.ll delete mode 100644 test/CodeGen/X86/seh-filter.ll delete mode 100644 test/CodeGen/X86/seh-stack-realign-win32.ll create mode 100644 test/CodeGen/X86/shrink-wrap-chkstk.ll create mode 100644 test/CodeGen/X86/slow-unaligned-mem.ll create mode 100644 test/CodeGen/X86/soft-sitofp.ll create mode 100644 test/CodeGen/X86/sse-only.ll create mode 100644 test/CodeGen/X86/sse3-intrinsics-fast-isel.ll create mode 100644 test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll create mode 100644 test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll create mode 100644 test/CodeGen/X86/stack-folding-adx-x86_64.ll create mode 100644 test/CodeGen/X86/stackmap-frame-setup.ll create mode 100644 test/CodeGen/X86/switch-edge-weight.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsave.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsavec.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsaves.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsave.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsavec.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsaveopt.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsaves.ll create mode 100644 test/CodeGen/X86/tail-dup-catchret.ll create mode 100644 test/CodeGen/X86/tail-merge-wineh.ll create mode 100644 test/CodeGen/X86/tailcall-msvc-conventions.ll create mode 100644 test/CodeGen/X86/tailcall-readnone.ll create mode 100644 test/CodeGen/X86/tls-android-negative.ll create mode 100644 test/CodeGen/X86/tls-android.ll create mode 100644 test/CodeGen/X86/token_landingpad.ll create mode 100644 test/CodeGen/X86/trunc-store.ll create mode 100644 test/CodeGen/X86/vec_cmp_sint-128.ll create mode 100644 
test/CodeGen/X86/vec_cmp_uint-128.ll create mode 100644 test/CodeGen/X86/vec_minmax_sint.ll create mode 100644 test/CodeGen/X86/vec_minmax_uint.ll create mode 100644 test/CodeGen/X86/vec_uint_to_fp-fastmath.ll create mode 100644 test/CodeGen/X86/vector-lzcnt-512.ll create mode 100644 test/CodeGen/X86/vector-merge-store-fp-constants.ll create mode 100644 test/CodeGen/X86/vector-popcnt-512.ll create mode 100644 test/CodeGen/X86/vector-rotate-128.ll create mode 100644 test/CodeGen/X86/vector-rotate-256.ll create mode 100644 test/CodeGen/X86/vector-shift-ashr-512.ll create mode 100644 test/CodeGen/X86/vector-shift-lshr-512.ll create mode 100644 test/CodeGen/X86/vector-shift-shl-512.ll create mode 100644 test/CodeGen/X86/vector-shuffle-512-v32.ll create mode 100644 test/CodeGen/X86/vector-shuffle-v1.ll create mode 100644 test/CodeGen/X86/vector-tzcnt-512.ll create mode 100644 test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll create mode 100644 test/CodeGen/X86/vmovq.ll create mode 100644 test/CodeGen/X86/wide-integer-cmp.ll create mode 100644 test/CodeGen/X86/win-catchpad-csrs.ll create mode 100644 test/CodeGen/X86/win-catchpad-nested-cxx.ll create mode 100644 test/CodeGen/X86/win-catchpad-nested.ll create mode 100644 test/CodeGen/X86/win-catchpad-varargs.ll create mode 100644 test/CodeGen/X86/win-catchpad.ll create mode 100644 test/CodeGen/X86/win-cleanuppad.ll create mode 100644 test/CodeGen/X86/win-funclet-cfi.ll create mode 100644 test/CodeGen/X86/win-mixed-ehpersonality.ll create mode 100644 test/CodeGen/X86/win32-seh-catchpad-realign.ll create mode 100644 test/CodeGen/X86/win32-seh-catchpad.ll create mode 100644 test/CodeGen/X86/win32-seh-nested-finally.ll create mode 100644 test/CodeGen/X86/win32-spill-xmm.ll create mode 100644 test/CodeGen/X86/win64_sibcall.ll create mode 100644 test/CodeGen/X86/win_coreclr_chkstk.ll delete mode 100644 test/CodeGen/X86/win_eh_prepare.ll delete mode 100644 test/CodeGen/X86/win_ftol2.ll create mode 100644 
test/CodeGen/X86/wineh-coreclr.ll create mode 100644 test/CodeGen/X86/wineh-exceptionpointer.ll create mode 100644 test/CodeGen/X86/wineh-no-ehpads.ll create mode 100644 test/CodeGen/X86/x32-indirectbr.ll create mode 100644 test/CodeGen/X86/x32-landingpad.ll create mode 100644 test/CodeGen/X86/x32-va_start.ll create mode 100644 test/CodeGen/X86/x86-32-intrcc.ll create mode 100644 test/CodeGen/X86/x86-64-intrcc.ll create mode 100644 test/CodeGen/X86/x86-64-ms_abi-vararg.ll create mode 100644 test/CodeGen/X86/x86-sanitizer-shrink-wrapping.ll create mode 100644 test/CodeGen/X86/x86-shrink-wrap-unwind.ll create mode 100644 test/CodeGen/X86/x86-win64-shrink-wrapping.ll create mode 100644 test/CodeGen/X86/xop-pcmov.ll (limited to 'test/CodeGen') diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll index b075573cc6742..5eb455f3a22cd 100644 --- a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll +++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll @@ -3,7 +3,7 @@ ; Bug 20598 -define void @test() #0 { +define void @test() #0 !dbg !4 { entry: br label %for.body, !dbg !39 @@ -44,39 +44,39 @@ attributes #1 = { nounwind readnone } !llvm.module.flags = !{!36, !37} !llvm.ident = !{!38} -!0 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "test.c", directory: "") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 141, file: !1, scope: !1, type: !6, function: void ()* @test, 
variables: !12) +!4 = distinct !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 141, file: !1, scope: !1, type: !6, variables: !12) !6 = !DISubroutineType(types: !7) !7 = !{null, !8} !8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9) !9 = !DIDerivedType(tag: DW_TAG_typedef, line: 30, file: !1, baseType: !11) !11 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !12 = !{!13, !14, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35} -!13 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "", line: 140, arg: 1, scope: !4, file: !1, type: !8) -!14 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) +!13 = !DILocalVariable(name: "", line: 140, arg: 1, scope: !4, file: !1, type: !8) +!14 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) !15 = !DIDerivedType(tag: DW_TAG_typedef, line: 183, file: !1, baseType: !17) !17 = !DIBasicType(tag: DW_TAG_base_type, size: 64, align: 64, encoding: DW_ATE_signed) -!18 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!19 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!20 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!21 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!22 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!23 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!24 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!25 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, 
file: !1, type: !15) -!26 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15) -!27 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15) -!28 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15) -!29 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!30 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!31 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!32 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!33 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!34 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 145, scope: !4, file: !1, type: !8) -!35 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 146, scope: !4, file: !1, type: !11) +!18 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!19 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!20 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!21 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!22 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!23 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!24 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!25 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!26 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!27 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!28 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!29 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!30 = 
!DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!31 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!32 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!33 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!34 = !DILocalVariable(name: "", line: 145, scope: !4, file: !1, type: !8) +!35 = !DILocalVariable(name: "", line: 146, scope: !4, file: !1, type: !11) !36 = !{i32 2, !"Dwarf Version", i32 4} !37 = !{i32 2, !"Debug Info Version", i32 3} !38 = !{!"clang version 3.6.0 "} diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll new file mode 100644 index 0000000000000..ca374eea28e72 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-addv.ll @@ -0,0 +1,98 @@ +; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s + +define i8 @add_B(<16 x i8>* %arr) { +; CHECK-LABEL: add_B +; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b + %bin.rdx = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> + %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0 + %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> + %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf + %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> + %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> + %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13 + %r = extractelement <16 x i8> %bin.rdx14, i32 0 + ret i8 %r +} + +define i16 @add_H(<8 x i16>* %arr) { +; CHECK-LABEL: add_H +; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + %bin.rdx = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> + %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> + %bin.rdx13 = add <8 x i16> %bin.rdx11, 
%rdx.shuf12 + %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> + %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13 + %r = extractelement <8 x i16> %bin.rdx14, i32 0 + ret i16 %r +} + +define i32 @add_S( <4 x i32>* %arr) { +; CHECK-LABEL: add_S +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + %bin.rdx = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> + %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> + %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12 + %r = extractelement <4 x i32> %bin.rdx13, i32 0 + ret i32 %r +} + +define i64 @add_D(<2 x i64>* %arr) { +; CHECK-LABEL: add_D +; CHECK-NOT: addv + %bin.rdx = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> + %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0 + %r = extractelement <2 x i64> %bin.rdx0, i32 0 + ret i64 %r +} + +define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) { +; CHECK-LABEL: oversized_ADDV_256 +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s +entry: + %0 = bitcast i8* %arg1 to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = bitcast i8* %arg2 to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i32> + %6 = sub nsw <8 x i32> %2, %5 + %7 = icmp slt <8 x i32> %6, zeroinitializer + %8 = sub nsw <8 x i32> zeroinitializer, %6 + %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 + %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %9, %rdx.shuf + %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1 + %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32> + %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3 + %10 = extractelement <8 
x i32> %bin.rdx4, i32 0 + ret i32 %10 +} + +define i32 @oversized_ADDV_512(<16 x i32>* %arr) { +; CHECK-LABEL: oversized_ADDV_512 +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + %bin.rdx = load <16 x i32>, <16 x i32>* %arr + + %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0 + + %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32> + %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf + + %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x i32> undef, <16 x i32> + %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12 + + %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> + %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13 + + %r = extractelement <16 x i32> %bin.rdx14, i32 0 + ret i32 %r +} diff --git a/test/CodeGen/AArch64/aarch64-deferred-spilling.ll b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll new file mode 100644 index 0000000000000..7accdced7d44d --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll @@ -0,0 +1,514 @@ +;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED +;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR + +; Check that we do not end up with useless spill code. +; +; Move to the basic block we are interested in. +; +; CHECK: // %if.then.120 +; +; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill +; Check that w21 wouldn't need to be spilled since it is never reused. +; REGULAR-NOT: {{[wx]}}21{{,?}} +; +; Check that w22 is used to carry a value through the call. 
+; DEFERRED-NOT: str {{[wx]}}22, +; DEFERRED: mov {{[wx]}}22, +; DEFERRED-NOT: str {{[wx]}}22, +; +; CHECK: bl fprintf +; +; DEFERRED-NOT: ldr {{[wx]}}22, +; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22 +; DEFERRED-NOT: ldr {{[wx]}}22, +; +; REGULAR-NOT: {{[wx]}}21{{,?}} +; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload +; +; End of the basic block we are interested in. +; CHECK: b +; CHECK: {{[^:]+}}: // %sw.bb.123 + +%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 } +%struct.__sbuf = type { i8*, i64 } +%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* } +%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* } + +@__sF = external global [0 x %struct.__sFILE], align 8 +@.str = private unnamed_addr constant [20 x i8] c"\0A [%d: stuff+mf \00", align 1 + +declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...) 
+ +declare void @bar(i32) + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) + +define i32 @foo(%struct.DState* %s) { +entry: + %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1 + %tmp = load i32, i32* %state, align 4 + %cmp = icmp eq i32 %tmp, 10 + %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40 + br i1 %cmp, label %if.end.thread, label %if.end + +if.end.thread: ; preds = %entry + %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 + %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 + %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 + %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 + %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 + %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 + %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 + %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 + %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 + %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 + %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 + %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 + %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 + %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 + %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 + %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 + %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 + %save_zj = 
getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58 + %tmp1 = bitcast i32* %save_i to i8* + call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false) + br label %sw.default + +if.end: ; preds = %entry + %.pre = load i32, i32* %save_i, align 4 + %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 + %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4 + %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 + %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4 + %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 + %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4 + %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 + %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4 + %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 + %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4 + %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 + %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4 + %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 + %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4 + %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 + %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4 + %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 + %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4 + %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 + %.pre415 = load 
i32, i32* %save_nblockMAX12.phi.trans.insert, align 4 + %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 + %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4 + %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 + %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4 + %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 + %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4 + %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 + %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4 + %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 + %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4 + %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 + %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4 + %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 + %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4 + %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58 + %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4 + switch i32 %tmp, label %sw.default [ + i32 13, label %sw.bb + i32 14, label %if.end.sw.bb.65_crit_edge + i32 25, label %if.end.sw.bb.123_crit_edge + ] + +if.end.sw.bb.123_crit_edge: ; preds = %if.end + %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 + br label %sw.bb.123 + +if.end.sw.bb.65_crit_edge: ; preds = %if.end + %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 + %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4 + br label %sw.bb.65 + +sw.bb: ; preds = 
%if.end + %sunkaddr = ptrtoint %struct.DState* %s to i64 + %sunkaddr485 = add i64 %sunkaddr, 8 + %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32* + store i32 13, i32* %sunkaddr486, align 4 + %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 + %tmp2 = load i32, i32* %bsLive, align 4 + %cmp28.400 = icmp sgt i32 %tmp2, 7 + br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph + +sw.bb.if.then.29_crit_edge: ; preds = %sw.bb + %sunkaddr487 = ptrtoint %struct.DState* %s to i64 + %sunkaddr488 = add i64 %sunkaddr487, 32 + %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32* + %.pre425 = load i32, i32* %sunkaddr489, align 4 + br label %if.then.29 + +if.end.33.lr.ph: ; preds = %sw.bb + %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream** + %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8 + %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1 + %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4 + %tmp4 = add i32 %.pre430, -1 + br label %if.end.33 + +if.then.29: ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge + %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ] + %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ] + %sub = add nsw i32 %.lcssa393, -8 + %shr = lshr i32 %tmp5, %sub + %and = and i32 %shr, 255 + %sunkaddr491 = ptrtoint %struct.DState* %s to i64 + %sunkaddr492 = add i64 %sunkaddr491, 36 + %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32* + store i32 %sub, i32* %sunkaddr493, align 4 + %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9 + store i32 %and, i32* %blockSize100k, align 4 + %and.off = add nsw i32 %and, -49 + %tmp6 = icmp ugt i32 %and.off, 8 + br i1 %tmp6, label %save_state_and_return, label %if.end.62 + +if.end.33: ; preds = %while.body.backedge, %if.end.33.lr.ph + %lsr.iv482 = phi i32 [ 
%tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ] + %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ] + %cmp35 = icmp eq i32 %lsr.iv482, -1 + br i1 %cmp35, label %save_state_and_return, label %if.end.37 + +if.end.37: ; preds = %if.end.33 + %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8** + %sunkaddr494 = ptrtoint %struct.DState* %s to i64 + %sunkaddr495 = add i64 %sunkaddr494, 32 + %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32* + %tmp9 = load i32, i32* %sunkaddr496, align 4 + %shl = shl i32 %tmp9, 8 + %tmp10 = load i8*, i8** %tmp8, align 8 + %tmp11 = load i8, i8* %tmp10, align 1 + %conv = zext i8 %tmp11 to i32 + %or = or i32 %conv, %shl + store i32 %or, i32* %sunkaddr496, align 4 + %add = add nsw i32 %tmp7, 8 + %sunkaddr497 = ptrtoint %struct.DState* %s to i64 + %sunkaddr498 = add i64 %sunkaddr497, 36 + %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32* + store i32 %add, i32* %sunkaddr499, align 4 + %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1 + store i8* %incdec.ptr, i8** %tmp8, align 8 + %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64 + %sunkaddr501 = add i64 %sunkaddr500, 8 + %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32* + store i32 %lsr.iv482, i32* %sunkaddr502, align 4 + %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64 + %sunkaddr504 = add i64 %sunkaddr503, 12 + %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32* + %tmp12 = load i32, i32* %sunkaddr505, align 4 + %inc = add i32 %tmp12, 1 + store i32 %inc, i32* %sunkaddr505, align 4 + %cmp49 = icmp eq i32 %inc, 0 + br i1 %cmp49, label %if.then.51, label %while.body.backedge + +if.then.51: ; preds = %if.end.37 + %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64 + %sunkaddr507 = add i64 %sunkaddr506, 16 + %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32* + %tmp13 = load i32, i32* %sunkaddr508, align 4 + %inc53 = add i32 %tmp13, 1 + store i32 %inc53, i32* %sunkaddr508, align 4 + br label %while.body.backedge + 
+while.body.backedge: ; preds = %if.then.51, %if.end.37 + %lsr.iv.next483 = add i32 %lsr.iv482, -1 + %cmp28 = icmp sgt i32 %add, 7 + br i1 %cmp28, label %if.then.29, label %if.end.33 + +if.end.62: ; preds = %if.then.29 + %sub64 = add nsw i32 %and, -48 + %sunkaddr509 = ptrtoint %struct.DState* %s to i64 + %sunkaddr510 = add i64 %sunkaddr509, 40 + %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32* + store i32 %sub64, i32* %sunkaddr511, align 4 + br label %sw.bb.65 + +sw.bb.65: ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge + %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ] + %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ] + %sunkaddr512 = ptrtoint %struct.DState* %s to i64 + %sunkaddr513 = add i64 %sunkaddr512, 8 + %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32* + store i32 14, i32* %sunkaddr514, align 4 + %cmp70.397 = icmp sgt i32 %tmp14, 7 + br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph + +if.end.82.lr.ph: ; preds = %sw.bb.65 + %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream** + %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8 + %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1 + %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4 + %tmp16 = add i32 %.pre431, -1 + br label %if.end.82 + +if.then.72: ; preds = %while.body.68.backedge, %sw.bb.65 + %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ] + %sub76 = add nsw i32 %.lcssa390, -8 + %sunkaddr516 = ptrtoint %struct.DState* %s to i64 + %sunkaddr517 = add i64 %sunkaddr516, 36 + %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32* + store i32 %sub76, i32* %sunkaddr518, align 4 + %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11 + %tmp17 = load i32, i32* %currBlockNo, align 4 + %inc117 = add nsw i32 %tmp17, 1 + store i32 %inc117, i32* 
%currBlockNo, align 4 + %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12 + %tmp18 = load i32, i32* %verbosity, align 4 + %cmp118 = icmp sgt i32 %tmp18, 1 + br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0 + +if.end.82: ; preds = %while.body.68.backedge, %if.end.82.lr.ph + %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ] + %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ] + %cmp85 = icmp eq i32 %lsr.iv480, -1 + br i1 %cmp85, label %save_state_and_return, label %if.end.88 + +if.end.88: ; preds = %if.end.82 + %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8** + %sunkaddr519 = ptrtoint %struct.DState* %s to i64 + %sunkaddr520 = add i64 %sunkaddr519, 32 + %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32* + %tmp21 = load i32, i32* %sunkaddr521, align 4 + %shl90 = shl i32 %tmp21, 8 + %tmp22 = load i8*, i8** %tmp20, align 8 + %tmp23 = load i8, i8* %tmp22, align 1 + %conv93 = zext i8 %tmp23 to i32 + %or94 = or i32 %conv93, %shl90 + store i32 %or94, i32* %sunkaddr521, align 4 + %add97 = add nsw i32 %tmp19, 8 + %sunkaddr522 = ptrtoint %struct.DState* %s to i64 + %sunkaddr523 = add i64 %sunkaddr522, 36 + %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32* + store i32 %add97, i32* %sunkaddr524, align 4 + %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1 + store i8* %incdec.ptr100, i8** %tmp20, align 8 + %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64 + %sunkaddr526 = add i64 %sunkaddr525, 8 + %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32* + store i32 %lsr.iv480, i32* %sunkaddr527, align 4 + %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64 + %sunkaddr529 = add i64 %sunkaddr528, 12 + %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32* + %tmp24 = load i32, i32* %sunkaddr530, align 4 + %inc106 = add i32 %tmp24, 1 + store i32 %inc106, i32* %sunkaddr530, align 4 + %cmp109 = icmp eq i32 %inc106, 0 + br i1 %cmp109, label 
%if.then.111, label %while.body.68.backedge + +if.then.111: ; preds = %if.end.88 + %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64 + %sunkaddr532 = add i64 %sunkaddr531, 16 + %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32* + %tmp25 = load i32, i32* %sunkaddr533, align 4 + %inc114 = add i32 %tmp25, 1 + store i32 %inc114, i32* %sunkaddr533, align 4 + br label %while.body.68.backedge + +while.body.68.backedge: ; preds = %if.then.111, %if.end.88 + %lsr.iv.next481 = add i32 %lsr.iv480, -1 + %cmp70 = icmp sgt i32 %add97, 7 + br i1 %cmp70, label %if.then.72, label %if.end.82 + +if.then.120: ; preds = %if.then.72 + %call = tail call i32 (%struct.__sFILE*, i8*, ...) @fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117) + br label %sw.bb.123 + +sw.bb.123: ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge + %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ] + %sunkaddr534 = ptrtoint %struct.DState* %s to i64 + %sunkaddr535 = add i64 %sunkaddr534, 8 + %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32* + store i32 25, i32* %sunkaddr536, align 4 + %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4 + %cmp128.395 = icmp sgt i32 %tmp26, 7 + br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph + +sw.bb.123.if.then.130_crit_edge: ; preds = %sw.bb.123 + %sunkaddr537 = ptrtoint %struct.DState* %s to i64 + %sunkaddr538 = add i64 %sunkaddr537, 32 + %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32* + %.pre429 = load i32, i32* %sunkaddr539, align 4 + br label %if.then.130 + +if.end.140.lr.ph: ; preds = %sw.bb.123 + %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream** + %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8 + %avail_in142.phi.trans.insert = getelementptr 
inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1 + %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4 + %tmp28 = add i32 %.pre432, -1 + br label %if.end.140 + +if.then.130: ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge + %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ] + %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ] + %sub134 = add nsw i32 %.lcssa, -8 + %shr135 = lshr i32 %tmp29, %sub134 + store i32 %sub134, i32* %bsLive127.pre-phi, align 4 + %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13 + %tmp30 = load i32, i32* %origPtr, align 4 + %shl175 = shl i32 %tmp30, 8 + %conv176 = and i32 %shr135, 255 + %or177 = or i32 %shl175, %conv176 + store i32 %or177, i32* %origPtr, align 4 + %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27 + %tmp31 = load i32, i32* %nInUse, align 4 + %add179 = add nsw i32 %tmp31, 2 + br label %save_state_and_return + +if.end.140: ; preds = %while.body.126.backedge, %if.end.140.lr.ph + %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ] + %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ] + %cmp143 = icmp eq i32 %lsr.iv, -1 + br i1 %cmp143, label %save_state_and_return, label %if.end.146 + +if.end.146: ; preds = %if.end.140 + %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8** + %sunkaddr541 = ptrtoint %struct.DState* %s to i64 + %sunkaddr542 = add i64 %sunkaddr541, 32 + %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32* + %tmp34 = load i32, i32* %sunkaddr543, align 4 + %shl148 = shl i32 %tmp34, 8 + %tmp35 = load i8*, i8** %tmp33, align 8 + %tmp36 = load i8, i8* %tmp35, align 1 + %conv151 = zext i8 %tmp36 to i32 + %or152 = or i32 %conv151, %shl148 + store i32 %or152, i32* %sunkaddr543, align 4 + %add155 = add nsw i32 %tmp32, 8 + store i32 
%add155, i32* %bsLive127.pre-phi, align 4 + %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1 + store i8* %incdec.ptr158, i8** %tmp33, align 8 + %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64 + %sunkaddr545 = add i64 %sunkaddr544, 8 + %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32* + store i32 %lsr.iv, i32* %sunkaddr546, align 4 + %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64 + %sunkaddr548 = add i64 %sunkaddr547, 12 + %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32* + %tmp37 = load i32, i32* %sunkaddr549, align 4 + %inc164 = add i32 %tmp37, 1 + store i32 %inc164, i32* %sunkaddr549, align 4 + %cmp167 = icmp eq i32 %inc164, 0 + br i1 %cmp167, label %if.then.169, label %while.body.126.backedge + +if.then.169: ; preds = %if.end.146 + %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64 + %sunkaddr551 = add i64 %sunkaddr550, 16 + %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32* + %tmp38 = load i32, i32* %sunkaddr552, align 4 + %inc172 = add i32 %tmp38, 1 + store i32 %inc172, i32* %sunkaddr552, align 4 + br label %while.body.126.backedge + +while.body.126.backedge: ; preds = %if.then.169, %if.end.146 + %lsr.iv.next = add i32 %lsr.iv, -1 + %cmp128 = icmp sgt i32 %add155, 7 + br i1 %cmp128, label %if.then.130, label %if.end.140 + +sw.default: ; preds = %if.end, %if.end.thread + %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ] + %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ] + %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ] + %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ] + %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ] + %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ] + %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ] + %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ] + %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ] + %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ] + %tmp49 = phi i32 [ 0, 
%if.end.thread ], [ %.pre415, %if.end ] + %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ] + %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ] + %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ] + %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ] + %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ] + %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ] + %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ] + %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ] + %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ] + %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ] + %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ] + %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ] + %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ] + %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ] + %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ] + %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ] + %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ] + %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ] + %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ %save_nblock13.phi.trans.insert, %if.end ] + %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ] + %save_N15.pre-phi445 = phi i32* [ %save_N, 
%if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ] + %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ] + %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ] + %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ] + %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ] + %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ] + tail call void @bar(i32 4001) + br label %save_state_and_return + +save_state_and_return: ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29 + %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ] + %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ] + %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ] + %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ] + %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ] + %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ] + %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 
] + %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ] + %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ] + %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ] + %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ] + %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ] + %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ] + %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ] + %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, %if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ] + %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ] + %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ] + %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ 
%save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ] + %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ] + %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ] + %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ] + %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ] + %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ] + %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ 
%save_groupNo9.phi.trans.insert, %if.end.33 ] + %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ] + %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ] + %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ] + %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ] + %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ] + %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, 
%if.end.33 ] + %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ] + %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ] + %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ] + %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ] + %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ] + %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ] + %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ] + %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 
0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ] + store i32 %tmp58, i32* %save_i, align 4 + store i32 %tmp59, i32* %save_j3.pre-phi468, align 4 + store i32 %tmp60, i32* %save_t4.pre-phi466, align 4 + store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4 + store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4 + store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4 + store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4 + store i32 %tmp64, i32* %save_groupNo9.pre-phi456, align 4 + store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4 + store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4 + store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4 + store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4 + store i32 %tmp68, i32* %save_es14.pre-phi446, align 4 + store i32 %tmp69, i32* %save_N15.pre-phi444, align 4 + store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4 + store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4 + store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4 + store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4 + store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4 + ret i32 %retVal.0 +} + +!0 = !{!"branch_weights", i32 10, i32 1} diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll index 739570236da92..1820b8163a905 100644 --- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-post-ra < %s | FileCheck %s ; This test aims to check basic correctness of frame layout & ; frame access code. 
There are 8 functions in this test file, @@ -252,11 +252,11 @@ entry: ; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24] ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through frame pointer @@ -299,11 +299,11 @@ entry: ; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24] ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through frame pointer @@ -361,11 +361,11 @@ entry: ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack ; and set-up of base pointer (x19). -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through base pointer @@ -414,11 +414,11 @@ entry: ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack ; and set-up of base pointer (x19). 
-; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through base pointer @@ -465,11 +465,11 @@ entry: ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack ; and set-up of base pointer (x19). -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through base pointer @@ -522,10 +522,10 @@ bb1: ; CHECK-LABEL: realign_conditional2 ; Extra realignment in the prologue (performance issue). +; CHECK: tbz {{.*}} .[[LABEL:.*]] ; CHECK: sub x9, sp, #32 // =32 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp -; CHECK: tbz {{.*}} .[[LABEL:.*]] ; Stack is realigned in a non-entry BB. 
; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll index ea3b8fa557328..1bc2a3ccb1ca0 100644 --- a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll +++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll @@ -1,7 +1,10 @@ -; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -lower-interleaved-accesses=true < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON +; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true -mattr=-neon < %s | FileCheck %s -check-prefix=NONEON -; CHECK-LABEL: load_factor2: -; CHECK: ld2 { v0.8b, v1.8b }, [x0] +; NEON-LABEL: load_factor2: +; NEON: ld2 { v0.8b, v1.8b }, [x0] +; NONEON-LABEL: load_factor2: +; NONEON-NOT: ld2 define <8 x i8> @load_factor2(<16 x i8>* %ptr) { %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4 %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> @@ -10,8 +13,10 @@ define <8 x i8> @load_factor2(<16 x i8>* %ptr) { ret <8 x i8> %add } -; CHECK-LABEL: load_factor3: -; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: load_factor3: +; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: load_factor3: +; NONEON-NOT: ld3 define <4 x i32> @load_factor3(i32* %ptr) { %base = bitcast i32* %ptr to <12 x i32>* %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 @@ -21,8 +26,10 @@ define <4 x i32> @load_factor3(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: load_factor4: -; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: load_factor4: +; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: load_factor4: +; NONEON-NOT: ld4 define <4 x i32> @load_factor4(i32* %ptr) { %base = bitcast i32* %ptr to <16 x i32>* %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 @@ -32,16 +39,20 @@ define <4 x i32> @load_factor4(i32* %ptr) { ret <4 
x i32> %add } -; CHECK-LABEL: store_factor2: -; CHECK: st2 { v0.8b, v1.8b }, [x0] +; NEON-LABEL: store_factor2: +; NEON: st2 { v0.8b, v1.8b }, [x0] +; NONEON-LABEL: store_factor2: +; NONEON-NOT: st2 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) { %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4 ret void } -; CHECK-LABEL: store_factor3: -; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: store_factor3: +; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: store_factor3: +; NONEON-NOT: st3 define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { %base = bitcast i32* %ptr to <12 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -51,8 +62,10 @@ define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v ret void } -; CHECK-LABEL: store_factor4: -; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: store_factor4: +; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: store_factor4: +; NONEON-NOT: st4 define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { %base = bitcast i32* %ptr to <16 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -65,8 +78,10 @@ define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v ; The following cases test that interleaved access of pointer vectors can be ; matched to ldN/stN instruction. 
-; CHECK-LABEL: load_ptrvec_factor2: -; CHECK: ld2 { v0.2d, v1.2d }, [x0] +; NEON-LABEL: load_ptrvec_factor2: +; NEON: ld2 { v0.2d, v1.2d }, [x0] +; NONEON-LABEL: load_ptrvec_factor2: +; NONEON-NOT: ld2 define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) { %base = bitcast i32** %ptr to <4 x i32*>* %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4 @@ -74,8 +89,10 @@ define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) { ret <2 x i32*> %strided.v0 } -; CHECK-LABEL: load_ptrvec_factor3: -; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0] +; NEON-LABEL: load_ptrvec_factor3: +; NEON: ld3 { v0.2d, v1.2d, v2.2d }, [x0] +; NONEON-LABEL: load_ptrvec_factor3: +; NONEON-NOT: ld3 define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { %base = bitcast i32** %ptr to <6 x i32*>* %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4 @@ -86,8 +103,10 @@ define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr ret void } -; CHECK-LABEL: load_ptrvec_factor4: -; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NEON-LABEL: load_ptrvec_factor4: +; NEON: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NONEON-LABEL: load_ptrvec_factor4: +; NONEON-NOT: ld4 define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { %base = bitcast i32** %ptr to <8 x i32*>* %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4 @@ -98,8 +117,10 @@ define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr ret void } -; CHECK-LABEL: store_ptrvec_factor2: -; CHECK: st2 { v0.2d, v1.2d }, [x0] +; NEON-LABEL: store_ptrvec_factor2: +; NEON: st2 { v0.2d, v1.2d }, [x0] +; NONEON-LABEL: store_ptrvec_factor2: +; NONEON-NOT: st2 define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) { %base = bitcast i32** %ptr to <4 x i32*>* %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> @@ -107,8 +128,10 @@ define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) { 
ret void } -; CHECK-LABEL: store_ptrvec_factor3: -; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0] +; NEON-LABEL: store_ptrvec_factor3: +; NEON: st3 { v0.2d, v1.2d, v2.2d }, [x0] +; NONEON-LABEL: store_ptrvec_factor3: +; NONEON-NOT: st3 define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) { %base = bitcast i32** %ptr to <6 x i32*>* %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> @@ -118,8 +141,10 @@ define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 ret void } -; CHECK-LABEL: store_ptrvec_factor4: -; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NEON-LABEL: store_ptrvec_factor4: +; NEON: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NONEON-LABEL: store_ptrvec_factor4: +; NONEON-NOT: st4 define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) { %base = bitcast i32* %ptr to <8 x i32*>* %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> @@ -132,8 +157,10 @@ define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 ; Following cases check that shuffle maskes with undef indices can be matched ; into ldN/stN instruction. 
-; CHECK-LABEL: load_undef_mask_factor2: -; CHECK: ld2 { v0.4s, v1.4s }, [x0] +; NEON-LABEL: load_undef_mask_factor2: +; NEON: ld2 { v0.4s, v1.4s }, [x0] +; NONEON-LABEL: load_undef_mask_factor2: +; NONEON-NOT: ld2 define <4 x i32> @load_undef_mask_factor2(i32* %ptr) { %base = bitcast i32* %ptr to <8 x i32>* %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4 @@ -143,8 +170,10 @@ define <4 x i32> @load_undef_mask_factor2(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: load_undef_mask_factor3: -; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: load_undef_mask_factor3: +; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: load_undef_mask_factor3: +; NONEON-NOT: ld3 define <4 x i32> @load_undef_mask_factor3(i32* %ptr) { %base = bitcast i32* %ptr to <12 x i32>* %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 @@ -154,8 +183,10 @@ define <4 x i32> @load_undef_mask_factor3(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: load_undef_mask_factor4: -; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: load_undef_mask_factor4: +; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: load_undef_mask_factor4: +; NONEON-NOT: ld4 define <4 x i32> @load_undef_mask_factor4(i32* %ptr) { %base = bitcast i32* %ptr to <16 x i32>* %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 @@ -165,8 +196,10 @@ define <4 x i32> @load_undef_mask_factor4(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: store_undef_mask_factor2: -; CHECK: st2 { v0.4s, v1.4s }, [x0] +; NEON-LABEL: store_undef_mask_factor2: +; NEON: st2 { v0.4s, v1.4s }, [x0] +; NONEON-LABEL: store_undef_mask_factor2: +; NONEON-NOT: st2 define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) { %base = bitcast i32* %ptr to <8 x i32>* %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -174,8 +207,10 @@ define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) { ret void } -; CHECK-LABEL: store_undef_mask_factor3: 
-; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: store_undef_mask_factor3: +; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: store_undef_mask_factor3: +; NONEON-NOT: st3 define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { %base = bitcast i32* %ptr to <12 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -185,8 +220,10 @@ define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, < ret void } -; CHECK-LABEL: store_undef_mask_factor4: -; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: store_undef_mask_factor4: +; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: store_undef_mask_factor4: +; NONEON-NOT: st4 define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { %base = bitcast i32* %ptr to <16 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -195,3 +232,39 @@ define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, < store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 ret void } + +; Check that we do something sane with illegal types. 
+ +; NEON-LABEL: load_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: ldr q[[V:[0-9]+]], [x0] +; NEON-NEXT: uzp1 v0.4s, v[[V]].4s, v{{.*}}.4s +; NEON-NEXT: ret +; NONEON-LABEL: load_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: ldr s0, [x0] +; NONEON-NEXT: ldr s1, [x0, #8] +; NONEON-NEXT: ret +define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind { + %tmp1 = load <3 x float>, <3 x float>* %p, align 16 + %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> + ret <3 x float> %tmp2 +} + +; NEON-LABEL: store_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: uzp1 v0.4s, v0.4s, v{{.*}}.4s +; NEON-NEXT: st1 { v0.d }[0], [x0] +; NEON-NEXT: ret +; NONEON-LABEL: store_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: fmov w[[ELT2:[0-9]+]], s2 +; NONEON-NEXT: fmov w[[RES:[0-9]+]], s0 +; NONEON-NEXT: bfi x[[RES]], x[[ELT2]], #32, #32 +; NONEON-NEXT: str x[[RES]], [x0] +; NONEON-NEXT: ret +define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind { + %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> + store <3 x float> %tmp1, <3 x float>* %p, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll new file mode 100644 index 0000000000000..84277995ce5b5 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll @@ -0,0 +1,50 @@ +; RUN: llc -O3 -aarch64-gep-opt=true -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s +; REQUIRES: asserts +target triple = "aarch64--linux-android" + +%typeD = type { i32, i32, [256 x i32], [257 x i32] } + +; Function Attrs: noreturn nounwind uwtable +define i32 @test1(%typeD* nocapture %s) { +entry: +; CHECK-LABEL: entry: +; CHECK: %uglygep = getelementptr i8, i8* %0, i64 1032 +; CHECK: br label %do.body.i + + + %tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0 + %k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1 + %.pre = load i32, i32* %tPos, align 4 + br 
label %do.body.i + +do.body.i: +; CHECK-LABEL: do.body.i: +; CHECK: %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3 +; CHECK-NEXT: %4 = bitcast i8* %uglygep2 to i32* +; CHECK-NOT: %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032 + + + %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ] + %1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ] + %add.i = add nsw i32 %1, %0 + %shr.i = ashr i32 %add.i, 1 + %idxprom.i = sext i32 %shr.i to i64 + %arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i + %2 = load i32, i32* %arrayidx.i, align 4 + %cmp.i = icmp sle i32 %2, %.pre + %na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i + %nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1 + %sub.i = sub nsw i32 %na.1.i, %nb.1.i + %cmp1.i = icmp eq i32 %sub.i, 1 + br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge + +do.body.i.backedge: + %.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ] + %.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ] + br label %do.body.i + +fooo.exit: ; preds = %do.body.i + store i32 %nb.1.i, i32* %k0, align 4 + br label %do.body.i.backedge +} + diff --git a/test/CodeGen/AArch64/aarch64-minmaxv.ll b/test/CodeGen/AArch64/aarch64-minmaxv.ll new file mode 100644 index 0000000000000..fb13b706cfafb --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -0,0 +1,511 @@ +; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linu--gnu" + +; CHECK-LABEL: smax_B +; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { + %arr.load = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> 
%rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: smax_H +; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + 
%rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: smax_S +; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: smax_D +; CHECK-NOT: smaxv +define i64 @smax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: umax_B +; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = 
load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: umax_H +; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = 
select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: umax_S +; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: umax_D +; CHECK-NOT: umaxv +define i64 @umax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, 
i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: smin_B +; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: smin_H +; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp slt <8 x i16> 
%rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: smin_S +; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: smin_D +; CHECK-NOT: sminv +define i64 @smin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 
x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: umin_B +; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt 
+ ret i8 %r +} + +; CHECK-LABEL: umin_H +; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: umin_S +; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> 
%rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: umin_D +; CHECK-NOT: uminv +define i64 @umin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + +; CHECK-LABEL: fmaxnm_S +; CHECK: fmaxnmv +define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x float>, <4 x float>* %arr + %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1 + %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 + %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 + %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 + %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt + ret float %r +} + +; CHECK-LABEL: fminnm_S +; CHECK: fminnmv +define float @fminnm_S(<4 x float>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x float>, <4 x float>* %arr + %rdx.shuf = shufflevector <4 x 
float> %rdx.minmax.select, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1 + %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 + %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 + %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 + %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt + ret float %r +} + +define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umax_256 +; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: umaxv {{h[0-9]+}}, [[V0]] + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i16> 
%rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umax_512 +; CHECK: umax v +; CHECK-NEXT: umax v +; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r 
+} + +define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umin_256 +; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: uminv {{h[0-9]+}}, [[V0]] + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umin_512 +; CHECK: umin v +; CHECK-NEXT: umin v +; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x 
i32> + %rdx.minmax.cmp22 = icmp ult <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r +} + +define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smax_256 +; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: smaxv {{h[0-9]+}}, [[V0]] + %arr.load = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> 
%rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smax_512 +; CHECK: smax v +; CHECK-NEXT: smax v +; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> 
%rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r +} + +define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smin_256 +; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: sminv {{h[0-9]+}}, [[V0]] + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 
%rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smin_512 +; CHECK: smin v +; CHECK-NEXT: smin v +; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r +} diff --git a/test/CodeGen/AArch64/aarch64-smax-constantfold.ll b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll new file mode 100644 index 0000000000000..0e5b59f95126d --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll @@ -0,0 +1,12 @@ +; RUN: llc 
-mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s + +; Function Attrs: nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) + +; CHECK-LABEL: test +define <4 x i16> @test() { +entry: +; CHECK: movi d{{[0-9]+}}, #0000000000000000 + %0 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> , <4 x i16> zeroinitializer) + ret <4 x i16> %0 +} diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll index f0c7572ebf136..f30ab89f238bf 100644 --- a/test/CodeGen/AArch64/addsub_ext.ll +++ b/test/CodeGen/AArch64/addsub_ext.ll @@ -80,6 +80,64 @@ end: ret void } +define void @sub_i8rhs() minsize { +; CHECK-LABEL: sub_i8rhs: + %val8_tmp = load i8, i8* @var8 + %lhs32 = load i32, i32* @var32 + %lhs64 = load i64, i64* @var64 + + ; Need this to prevent extension upon load and give a vanilla i8 operand. + %val8 = add i8 %val8_tmp, 123 + + +; Zero-extending to 32-bits + %rhs32_zext = zext i8 %val8 to i32 + %res32_zext = sub i32 %lhs32, %rhs32_zext + store volatile i32 %res32_zext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb + + %rhs32_zext_shift = shl i32 %rhs32_zext, 3 + %res32_zext_shift = sub i32 %lhs32, %rhs32_zext_shift + store volatile i32 %res32_zext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb #3 + + +; Zero-extending to 64-bits + %rhs64_zext = zext i8 %val8 to i64 + %res64_zext = sub i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb + + %rhs64_zext_shift = shl i64 %rhs64_zext, 1 + %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb #1 + +; Sign-extending to 32-bits + %rhs32_sext = sext i8 %val8 to i32 + %res32_sext = sub i32 %lhs32, %rhs32_sext + store volatile i32 %res32_sext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb + + 
%rhs32_sext_shift = shl i32 %rhs32_sext, 1 + %res32_sext_shift = sub i32 %lhs32, %rhs32_sext_shift + store volatile i32 %res32_sext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb #1 + +; Sign-extending to 64-bits + %rhs64_sext = sext i8 %val8 to i64 + %res64_sext = sub i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb + + %rhs64_sext_shift = shl i64 %rhs64_sext, 4 + %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb #4 + + ret void +} + define void @addsub_i16rhs() minsize { ; CHECK-LABEL: addsub_i16rhs: %val16_tmp = load i16, i16* @var16 @@ -155,6 +213,64 @@ end: ret void } +define void @sub_i16rhs() minsize { +; CHECK-LABEL: sub_i16rhs: + %val16_tmp = load i16, i16* @var16 + %lhs32 = load i32, i32* @var32 + %lhs64 = load i64, i64* @var64 + + ; Need this to prevent extension upon load and give a vanilla i16 operand. 
+ %val16 = add i16 %val16_tmp, 123 + + +; Zero-extending to 32-bits + %rhs32_zext = zext i16 %val16 to i32 + %res32_zext = sub i32 %lhs32, %rhs32_zext + store volatile i32 %res32_zext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth + + %rhs32_zext_shift = shl i32 %rhs32_zext, 3 + %res32_zext_shift = sub i32 %lhs32, %rhs32_zext_shift + store volatile i32 %res32_zext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth #3 + + +; Zero-extending to 64-bits + %rhs64_zext = zext i16 %val16 to i64 + %res64_zext = sub i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth + + %rhs64_zext_shift = shl i64 %rhs64_zext, 1 + %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth #1 + +; Sign-extending to 32-bits + %rhs32_sext = sext i16 %val16 to i32 + %res32_sext = sub i32 %lhs32, %rhs32_sext + store volatile i32 %res32_sext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth + + %rhs32_sext_shift = shl i32 %rhs32_sext, 1 + %res32_sext_shift = sub i32 %lhs32, %rhs32_sext_shift + store volatile i32 %res32_sext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth #1 + +; Sign-extending to 64-bits + %rhs64_sext = sext i16 %val16 to i64 + %res64_sext = sub i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth + + %rhs64_sext_shift = shl i64 %rhs64_sext, 4 + %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth #4 + + ret void +} + ; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for ; example), but the remaining instructions are probably not idiomatic ; in the face of "add/sub (shifted register)" so I don't intend to. 
@@ -187,3 +303,33 @@ define void @addsub_i32rhs() minsize { ret void } + +define void @sub_i32rhs() minsize { +; CHECK-LABEL: sub_i32rhs: + %val32_tmp = load i32, i32* @var32 + %lhs64 = load i64, i64* @var64 + + %val32 = add i32 %val32_tmp, 123 + + %rhs64_zext = zext i32 %val32 to i64 + %res64_zext = sub i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw + + %rhs64_zext_shift = shl i64 %rhs64_zext, 2 + %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2 + + %rhs64_sext = sext i32 %val32 to i64 + %res64_sext = sub i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw + + %rhs64_sext_shift = shl i64 %rhs64_sext, 2 + %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw #2 + + ret void +} diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index 5b2278ce8a351..45754377b2d91 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s +; RUN: llc -mtriple=aarch64-linux-gnu -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s declare void @use_addr(i8*) diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll index 173a440326ac6..a66ea0df2e987 100644 --- 
a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll +++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll @@ -22,22 +22,22 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone !llvm.dbg.sp = !{!1, !7, !10, !11, !12} !0 = !DIGlobalVariable(name: "vsplive", line: 617, isLocal: true, isDefinition: true, scope: !1, file: !2, type: !6) -!1 = !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) +!1 = distinct !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) !2 = !DIFile(filename: "print.i", directory: "/Volumes/Ebi/echeng/radars/r9146594") -!3 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: 0, file: !20, enums: !21, retainedTypes: !21) +!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: 0, file: !20, enums: !21, retainedTypes: !21) !4 = !DISubroutineType(types: !5) !5 = !{!6} !6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) -!7 = !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) +!7 = distinct !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) !8 = !DISubroutineType(types: !9) !9 = !{null} -!10 = !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: 
true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) -!11 = !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) -!12 = !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) +!10 = distinct !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) +!11 = distinct !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) +!12 = distinct !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) !13 = !DILocation(line: 653, column: 5, scope: !14) !14 = distinct !DILexicalBlock(line: 652, column: 35, file: !20, scope: !15) !15 = distinct !DILexicalBlock(line: 616, column: 1, file: !20, scope: !1) -!16 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "do_tab_convert", line: 853, scope: !17, file: !2, type: !6) +!16 = !DILocalVariable(name: "do_tab_convert", line: 853, scope: !17, file: !2, type: !6) !17 = distinct !DILexicalBlock(line: 850, column: 12, file: !20, scope: !14) !18 = !DILocation(line: 853, column: 11, scope: !17) !19 = !DILocation(line: 853, column: 29, scope: !17) diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll index f27570acc820e..e77952e4b8a1c 100644 --- a/test/CodeGen/AArch64/arm64-aapcs-be.ll +++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll @@ -32,7 +32,7 @@ define float @test_block_addr([8 x float], [1 x float] %in) { define void @test_block_addr_callee() { ; CHECK-LABEL: 
test_block_addr_callee: -; CHECK: str {{[a-z0-9]+}}, [sp] +; CHECK: str {{[a-z0-9]+}}, [sp, #-16]! ; CHECK: bl test_block_addr %val = insertvalue [1 x float] undef, float 0.0, 0 call float @test_block_addr([8 x float] undef, [1 x float] %val) diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll index d0880cd4f3eb5..441f45bf90b34 100644 --- a/test/CodeGen/AArch64/arm64-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false -disable-post-ra < %s | FileCheck %s @var = global i32 0, align 4 @@ -27,12 +27,13 @@ define [2 x i64] @test_i64x2_align(i32, [2 x i64] %arg, i32 %after) { ; Check stack slots are 64-bit at all times. define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, i32 %int, i64 %long) { - ; Part of last store. Blasted scheduler. -; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] - %ext_bool = zext i1 %bool to i64 store volatile i64 %ext_bool, i64* @var64, align 8 ; CHECK: ldrb w[[EXT:[0-9]+]], [sp] + + ; Part of last store. Blasted scheduler. 
+; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] + ; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1 ; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64] @@ -63,8 +64,8 @@ define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { %ext_bool = zext i1 %bool to i64 store volatile i64 %ext_bool, i64* @var64 -; CHECK: and [[EXT:x[0-9]+]], x0, #0x1 -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: and w[[EXT:[0-9]+]], w0, #0x1 +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] %ext_char = sext i8 %char to i64 store volatile i64 %ext_char, i64* @var64 @@ -73,13 +74,13 @@ define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { %ext_short = zext i16 %short to i64 store volatile i64 %ext_short, i64* @var64 -; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: and w[[EXT:[0-9]+]], w2, #0xffff +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] %ext_int = zext i32 %int to i64 store volatile i64 %ext_int, i64* @var64 -; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32 -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: mov w[[EXT:[0-9]+]], w3 +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] ret void } diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll index 1c1b58b8b140d..dc9884f12f571 100644 --- a/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -508,7 +508,7 @@ entry: ; "i64 %0" should be in register x7. ; "i32 8" should be on stack at [sp]. ; CHECK: ldr x7, [{{x[0-9]+}}] -; CHECK: str {{w[0-9]+}}, [sp] +; CHECK: str {{w[0-9]+}}, [sp, #-16]! 
; FAST-LABEL: i64_split ; FAST: ldr x7, [{{x[0-9]+}}] ; FAST: mov x[[R0:[0-9]+]], sp diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll index 4703d25a6016b..d46800d34cac8 100644 --- a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll +++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -1,6 +1,7 @@ -; RUN: llc -march arm64 < %s | FileCheck %s +; RUN: llc -march arm64 < %s -aarch64-collect-loh=false | FileCheck %s ; rdar://13452552 -; ModuleID = 'reduced_test.ll' +; Disable the collecting of LOH so that the labels do not get in the +; way of the NEXT patterns. target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" target triple = "arm64-apple-ios3.0.0" @@ -13,8 +14,8 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { ; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]] ; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw] ; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw] -; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]] -; CHECK-NEXT b.ne +; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]] +; CHECK-NEXT: b.ne ; Next BB ; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw ; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll index eb0cd3547bdad..36424506bee87 100644 --- a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll +++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s ; CHECK: foo -; CHECK: ldr w[[REG:[0-9]+]], [x19, #264] -; CHECK: str w[[REG]], [x19, #132] -; CHECK: ldr w{{[0-9]+}}, [x19, #264] +; CHECK: str w[[REG0:[0-9]+]], [x19, #264] +; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]] +; CHECK: str w[[REG1]], [x19, #132] define i32 
@foo(i32 %a) nounwind { %retval = alloca i32, align 4 diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll index f36e706b15dda..d5d9a1b98174b 100644 --- a/test/CodeGen/AArch64/arm64-arith.ll +++ b/test/CodeGen/AArch64/arm64-arith.ll @@ -123,7 +123,8 @@ entry: define i64 @t14(i16 %a, i64 %x) nounwind ssp { entry: ; CHECK-LABEL: t14: -; CHECK: add x0, x1, w0, uxth #3 +; CHECK: and w8, w0, #0xffff +; CHECK: add x0, x1, w8, uxtw #3 ; CHECK: ret %c = zext i16 %a to i64 %d = shl i64 %c, 3 diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll index a76cf74a6d0c2..44c24c51f0df5 100644 --- a/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -173,10 +173,13 @@ define i128 @atomic_load_seq_cst(i128* %p) { ret i128 %r } -define i128 @atomic_load_relaxed(i128* %p) { +define i128 @atomic_load_relaxed(i64, i64, i128* %p) { ; CHECK-LABEL: atomic_load_relaxed: ; CHECK-NOT: dmb -; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x2] +; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb %r = load atomic i128, i128* %p monotonic, align 16 ret i128 %r diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll index 0824bd881a95c..5d8d60de5fc5c 100644 --- a/test/CodeGen/AArch64/arm64-atomic.ll +++ b/test/CodeGen/AArch64/arm64-atomic.ll @@ -2,13 +2,17 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-LABEL: val_compare_and_swap: -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: stxr 
[[SCRATCH_REG:w[0-9]+]], w2, [x0] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] +; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -17,13 +21,16 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 { define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 { ; CHECK-LABEL: val_compare_and_swap_from_load: ; CHECK-NEXT: ldr [[NEW:w[0-9]+]], [x2] -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -32,13 +39,17 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 { define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-LABEL: val_compare_and_swap_rel: -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x0] -; 
CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -47,13 +58,16 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 { define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 { ; CHECK-LABEL: val_compare_and_swap_64: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK-NEXT: ldxr [[RESULT:x[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], x1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val @@ -61,13 +75,13 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 { define i32 @fetch_and_nand(i32* %p) #0 { ; CHECK-LABEL: fetch_and_nand: -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] ; CHECK: mvn [[TMP_REG:w[0-9]+]], w[[DEST_REG]] ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] ; CHECK: mov x0, x[[DEST_REG]] %val = 
atomicrmw nand i32* %p, i32 7 release ret i32 %val @@ -76,12 +90,12 @@ define i32 @fetch_and_nand(i32* %p) #0 { define i64 @fetch_and_nand_64(i64* %p) #0 { ; CHECK-LABEL: fetch_and_nand_64: ; CHECK: mov x[[ADDR:[0-9]+]], x0 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]] ; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]] ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] %val = atomicrmw nand i64* %p, i64 7 acq_rel ret i64 %val @@ -90,12 +104,12 @@ define i64 @fetch_and_nand_64(i64* %p) #0 { define i32 @fetch_and_or(i32* %p) #0 { ; CHECK-LABEL: fetch_and_or: ; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0] ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]] ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] ; CHECK: mov x0, x[[DEST_REG]] %val = atomicrmw or i32* %p, i32 5 seq_cst ret i32 %val @@ -104,11 +118,11 @@ define i32 @fetch_and_or(i32* %p) #0 { define i64 @fetch_and_or_64(i64* %p) #0 { ; CHECK: fetch_and_or_64: ; CHECK: mov x[[ADDR:[0-9]+]], x0 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7 ; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] %val = atomicrmw or i64* %p, i64 7 monotonic ret i64 %val } diff --git a/test/CodeGen/AArch64/arm64-builtins-linux.ll b/test/CodeGen/AArch64/arm64-builtins-linux.ll new file 
mode 100644 index 0000000000000..34fa1b4715615 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-builtins-linux.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s + +; Function Attrs: nounwind readnone +declare i8* @llvm.aarch64.thread.pointer() #1 + +define i8* @thread_pointer() { +; CHECK: thread_pointer: +; CHECK: mrs {{x[0-9]+}}, TPIDR_EL0 + %1 = tail call i8* @llvm.aarch64.thread.pointer() + ret i8* %1 +} diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll index 4e47ab6c03f3e..25d874e54cb7c 100644 --- a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll +++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll @@ -15,10 +15,10 @@ target triple = "arm64-apple-ios7.0.0" ; CHECK: Maze1 ; CHECK: %if.then ; CHECK: cmp x{{[0-9]+}}, #2 -; CHECK-NEXT b.cc +; CHECK-NEXT: b.lo ; CHECK: %if.then ; CHECK: cmp x{{[0-9]+}}, #2 -; CHECK-NEXT b.cc +; CHECK-NEXT: b.lo define i32 @Maze1() nounwind ssp { entry: %0 = load i64, i64* @channelColumns, align 8, !tbaa !0 diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index ff18f73643371..72d3b8331162f 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -104,11 +104,14 @@ if.end: ; preds = %if.then, %lor.lhs.f ; Speculatively execute division by zero. ; The sdiv/udiv instructions do not trap when the divisor is zero, so they are ; safe to speculate. 
-; CHECK: speculate_division -; CHECK-NOT: cmp -; CHECK: sdiv -; CHECK: cmp -; CHECK-NEXT: ccmp +; CHECK-LABEL: speculate_division: +; CHECK: cmp w0, #1 +; CHECK: sdiv [[DIVRES:w[0-9]+]], w1, w0 +; CHECK: ccmp [[DIVRES]], #16, #0, ge +; CHECK: b.gt [[BLOCK:LBB[0-9_]+]] +; CHECK: bl _foo +; CHECK: [[BLOCK]]: +; CHECK: orr w0, wzr, #0x7 define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp sgt i32 %a, 0 @@ -287,3 +290,156 @@ sw.bb.i.i: %code1.i.i.phi.trans.insert = getelementptr inbounds %str1, %str1* %0, i64 0, i32 0, i32 0, i64 16 br label %sw.bb.i.i } + +; CHECK-LABEL: select_and +define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { +; CHECK: cmp w1, #5 +; CHECK-NEXT: ccmp w0, w1, #0, ne +; CHECK-NEXT: csel x0, x2, x3, lt +; CHECK-NEXT: ret + %1 = icmp slt i32 %w0, %w1 + %2 = icmp ne i32 5, %w1 + %3 = and i1 %1, %2 + %sel = select i1 %3, i64 %x2, i64 %x3 + ret i64 %sel +} + +; CHECK-LABEL: select_or +define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { +; CHECK: cmp w1, #5 +; CHECK-NEXT: ccmp w0, w1, #8, eq +; CHECK-NEXT: csel x0, x2, x3, lt +; CHECK-NEXT: ret + %1 = icmp slt i32 %w0, %w1 + %2 = icmp ne i32 5, %w1 + %3 = or i1 %1, %2 + %sel = select i1 %3, i64 %x2, i64 %x3 + ret i64 %sel +} + +; CHECK-LABEL: select_complicated +define i16 @select_complicated(double %v1, double %v2, i16 %a, i16 %b) { +; CHECK: ldr [[REG:d[0-9]+]], +; CHECK: fcmp d0, d2 +; CHECK-NEXT: fmov d2, #13.00000000 +; CHECK-NEXT: fccmp d1, d2, #4, ne +; CHECK-NEXT: fccmp d0, d1, #1, ne +; CHECK-NEXT: fccmp d0, d1, #4, vc +; CHECK-NEXT: csel w0, w0, w1, eq + %1 = fcmp one double %v1, %v2 + %2 = fcmp oeq double %v2, 13.0 + %3 = fcmp oeq double %v1, 42.0 + %or0 = or i1 %2, %3 + %or1 = or i1 %1, %or0 + %sel = select i1 %or1, i16 %a, i16 %b + ret i16 %sel +} + +; CHECK-LABEL: gccbug +define i64 @gccbug(i64 %x0, i64 %x1) { +; CHECK: cmp x0, #2 +; CHECK-NEXT: ccmp x0, #4, #4, ne +; CHECK-NEXT: ccmp x1, #0, #0, eq +; CHECK-NEXT: orr w[[REGNUM:[0-9]+]], wzr, 
#0x1 +; CHECK-NEXT: cinc x0, x[[REGNUM]], eq +; CHECK-NEXT: ret + %cmp0 = icmp eq i64 %x1, 0 + %cmp1 = icmp eq i64 %x0, 2 + %cmp2 = icmp eq i64 %x0, 4 + + %or = or i1 %cmp2, %cmp1 + %and = and i1 %or, %cmp0 + + %sel = select i1 %and, i64 2, i64 1 + ret i64 %sel +} + +; CHECK-LABEL: select_ororand +define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) { +; CHECK: cmp w3, #4 +; CHECK-NEXT: ccmp w2, #2, #0, gt +; CHECK-NEXT: ccmp w1, #13, #2, ge +; CHECK-NEXT: ccmp w0, #0, #4, ls +; CHECK-NEXT: csel w0, w3, wzr, eq +; CHECK-NEXT: ret + %c0 = icmp eq i32 %w0, 0 + %c1 = icmp ugt i32 %w1, 13 + %c2 = icmp slt i32 %w2, 2 + %c4 = icmp sgt i32 %w3, 4 + %or = or i1 %c0, %c1 + %and = and i1 %c2, %c4 + %or1 = or i1 %or, %and + %sel = select i1 %or1, i32 %w3, i32 0 + ret i32 %sel +} + +; CHECK-LABEL: select_andor +define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) { +; CHECK: cmp w1, w2 +; CHECK-NEXT: ccmp w0, #0, #4, lt +; CHECK-NEXT: ccmp w0, w1, #0, eq +; CHECK-NEXT: csel w0, w0, w1, eq +; CHECK-NEXT: ret + %c0 = icmp eq i32 %v1, %v2 + %c1 = icmp sge i32 %v2, %v3 + %c2 = icmp eq i32 %v1, 0 + %or = or i1 %c2, %c1 + %and = and i1 %or, %c0 + %sel = select i1 %and, i32 %v1, i32 %v2 + ret i32 %sel +} + +; CHECK-LABEL: select_noccmp1 +define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) { +; CHECK: cmp x0, #0 +; CHECK-NEXT: cset [[REG0:w[0-9]+]], lt +; CHECK-NEXT: cmp x0, #13 +; CHECK-NOT: ccmp +; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: cmp x2, #2 +; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt +; CHECK-NEXT: cmp x2, #4 +; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt +; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] +; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]] +; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]] +; CHECK-NEXT: cmp [[REG6]], #0 +; CHECK-NEXT: csel x0, xzr, x3, ne +; CHECK-NEXT: ret + %c0 = icmp slt i64 %v1, 0 + %c1 = icmp sgt i64 %v1, 13 + %c2 = icmp slt i64 %v3, 2 + %c4 = icmp sgt i64 %v3, 4 + %and0 = and i1 %c0, %c1 + %and1 = 
and i1 %c2, %c4 + %or = or i1 %and0, %and1 + %sel = select i1 %or, i64 0, i64 %r + ret i64 %sel +} + +@g = global i32 0 + +; Should not use ccmp if we have to compute the or expression in an integer +; register anyway because of other users. +; CHECK-LABEL: select_noccmp2 +define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) { +; CHECK: cmp x0, #0 +; CHECK-NEXT: cset [[REG0:w[0-9]+]], lt +; CHECK-NOT: ccmp +; CHECK-NEXT: cmp x0, #13 +; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: orr [[REG2:w[0-9]+]], [[REG0]], [[REG1]] +; CHECK-NEXT: cmp [[REG2]], #0 +; CHECK-NEXT: csel x0, xzr, x3, ne +; CHECK-NEXT: sbfx [[REG3:w[0-9]+]], [[REG2]], #0, #1 +; CHECK-NEXT: adrp x[[REGN4:[0-9]+]], _g@PAGE +; CHECK-NEXT: str [[REG3]], [x[[REGN4]], _g@PAGEOFF] +; CHECK-NEXT: ret + %c0 = icmp slt i64 %v1, 0 + %c1 = icmp sgt i64 %v1, 13 + %or = or i1 %c0, %c1 + %sel = select i1 %or, i64 0, i64 %r + %ext = sext i1 %or to i32 + store volatile i32 %ext, i32* @g + ret i64 %sel +} diff --git a/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll new file mode 100644 index 0000000000000..528d2538bb4ab --- /dev/null +++ b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK: orr w0, wzr, #0x1 +; CHECK-NEXT: bl foo +; CHECK-NEXT: orr w0, wzr, #0x1 +; CHECK-NEXT: bl foo + +target triple = "aarch64--linux-android" +declare i32 @foo(i32) + +; Function Attrs: nounwind uwtable +define i32 @main() { +entry: + %call = tail call i32 @foo(i32 1) + %call1 = tail call i32 @foo(i32 1) + ret i32 0 +} diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll index c0aa63cc43312..59147d401a305 100644 --- a/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -51,3 +51,607 @@ if.end4: ; preds = %if.then2, %if.then, %add6 = add nsw i32 %tmp3, %t.addr.0 ret i32 %add6 } + +@C = common global i32 0, align 4 + +; 
Check that we catch AdrpLdrGotLdr case when we have a simple chain: +; adrp -> ldrgot -> ldr. +; CHECK-LABEL: _getC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i32 @getC() { + %res = load i32, i32* @C, align 4 + ret i32 %res +} + +; LDRSW supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExtC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsw x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExtC() { + %res = load i32, i32* @C, align 4 + %sextres = sext i32 %res to i64 + ret i64 %sextres +} + +; It may not be safe to fold the literal in the load if the address is +; used several times. +; Make sure we emit AdrpLdrGot for those. 
+; CHECK-LABEL: _getSeveralC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 +; CHECK-NEXT: str [[ADD]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define void @getSeveralC(i32 %t) { +entry: + %tmp = load i32, i32* @C, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* @C, align 4 + ret void +} + +; Make sure we catch that: +; adrp -> ldrgot -> str. +; CHECK-LABEL: _setC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define void @setC(i32 %t) { +entry: + store i32 %t, i32* @C, align 4 + ret void +} + +; Perform the same tests for internal global and a displacement +; in the addressing mode. +; Indeed we will get an ADD for those instead of LOADGot. +@InternalC = internal global i32 0, align 4 + +; Check that we catch AdrpAddLdr case when we have a simple chain: +; adrp -> add -> ldr. 
+; CHECK-LABEL: _getInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr w0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] +define i32 @getInternalCPlus4() { + %addr = getelementptr i32, i32* @InternalC, i32 4 + %res = load i32, i32* %addr, align 4 + ret i32 %res +} + +; LDRSW supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExtInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsw x0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExtInternalCPlus4() { + %addr = getelementptr i32, i32* @InternalC, i32 4 + %res = load i32, i32* %addr, align 4 + %sextres = sext i32 %res to i64 + ret i64 %sextres +} + +; It may not be safe to fold the literal in the load if the address is +; used several times. +; Make sure we emit AdrpAdd for those. 
+; CHECK-LABEL: _getSeveralInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 +; CHECK-NEXT: str [[ADD]], {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAdd [[ADRP_LABEL]], [[ADDGOT_LABEL]] +define void @getSeveralInternalCPlus4(i32 %t) { +entry: + %addr = getelementptr i32, i32* @InternalC, i32 4 + %tmp = load i32, i32* %addr, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* %addr, align 4 + ret void +} + +; Make sure we catch that: +; adrp -> add -> str. +; CHECK-LABEL: _setInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str w0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] +define void @setInternalCPlus4(i32 %t) { +entry: + %addr = getelementptr i32, i32* @InternalC, i32 4 + store i32 %t, i32* %addr, align 4 + ret void +} + +; Check that we catch AdrpAddLdr case when we have a simple chain: +; adrp -> ldr. +; CHECK-LABEL: _getInternalC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr w0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]] +define i32 @getInternalC() { + %res = load i32, i32* @InternalC, align 4 + ret i32 %res +} + +; LDRSW supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getSExtInternalC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsw x0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]] +define i64 @getSExtInternalC() { + %res = load i32, i32* @InternalC, align 4 + %sextres = sext i32 %res to i64 + ret i64 %sextres +} + +; It may not be safe to fold the literal in the load if the address is +; used several times. +; Make sure we do not catch anything here. We have an adrp alone, +; there is not much we can do about it. +; CHECK-LABEL: _getSeveralInternalC +; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 +; CHECK-NEXT: str [[ADD]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +define void @getSeveralInternalC(i32 %t) { +entry: + %tmp = load i32, i32* @InternalC, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* @InternalC, align 4 + ret void +} + +; Make sure we do not catch anything when: +; adrp -> str. +; We cannot fold anything in the str at this point. +; Indeed, strs do not support literals. +; CHECK-LABEL: _setInternalC +; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: str w0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +define void @setInternalC(i32 %t) { +entry: + store i32 %t, i32* @InternalC, align 4 + ret void +} + +; Now check other variant of loads/stores. + +@D = common global i8 0, align 4 + +; LDRB does not support loading from a literal. +; Make sure we emit AdrpLdrGot and not AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getD +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldrb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define i8 @getD() { + %res = load i8, i8* @D, align 4 + ret i8 %res +} + +; CHECK-LABEL: _setD +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: strb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setD(i8 %t) { + store i8 %t, i8* @D, align 4 + ret void +} + +; LDRSB supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExtD +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i32 @getSExtD() { + %res = load i8, i8* @D, align 4 + %sextres = sext i8 %res to i32 + ret i32 %sextres +} + +; LDRSB supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getSExt64D +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsb x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExt64D() { + %res = load i8, i8* @D, align 4 + %sextres = sext i8 %res to i64 + ret i64 %sextres +} + +@E = common global i16 0, align 4 + +; LDRH does not support loading from a literal. +; Make sure we emit AdrpLdrGot and not AdrpLdrGotLdr for those. +; CHECK-LABEL: _getE +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldrh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define i16 @getE() { + %res = load i16, i16* @E, align 4 + ret i16 %res +} + +; LDRSH supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getSExtE +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i32 @getSExtE() { + %res = load i16, i16* @E, align 4 + %sextres = sext i16 %res to i32 + ret i32 %sextres +} + +; CHECK-LABEL: _setE +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: strh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setE(i16 %t) { + store i16 %t, i16* @E, align 4 + ret void +} + +; LDRSH supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExt64E +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsh x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExt64E() { + %res = load i16, i16* @E, align 4 + %sextres = sext i16 %res to i64 + ret i64 %sextres +} + +@F = common global i64 0, align 4 + +; LDR supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getF +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getF() { + %res = load i64, i64* @F, align 4 + ret i64 %res +} + +; CHECK-LABEL: _setF +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setF(i64 %t) { + store i64 %t, i64* @F, align 4 + ret void +} + +@G = common global float 0.0, align 4 + +; LDR float supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getG +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define float @getG() { + %res = load float, float* @G, align 4 + ret float %res +} + +; CHECK-LABEL: _setG +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setG(float %t) { + store float %t, float* @G, align 4 + ret void +} + +@H = common global half 0.0, align 4 + +; LDR half supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getH +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define half @getH() { + %res = load half, half* @H, align 4 + ret half %res +} + +; CHECK-LABEL: _setH +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setH(half %t) { + store half %t, half* @H, align 4 + ret void +} + +@I = common global double 0.0, align 4 + +; LDR double supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getI +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define double @getI() { + %res = load double, double* @I, align 4 + ret double %res +} + +; CHECK-LABEL: _setI +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setI(double %t) { + store double %t, double* @I, align 4 + ret void +} + +@J = common global <2 x i32> , align 4 + +; LDR 64-bit vector supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getJ +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define <2 x i32> @getJ() { + %res = load <2 x i32>, <2 x i32>* @J, align 4 + ret <2 x i32> %res +} + +; CHECK-LABEL: _setJ +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setJ(<2 x i32> %t) { + store <2 x i32> %t, <2 x i32>* @J, align 4 + ret void +} + +@K = common global <4 x i32> , align 4 + +; LDR 128-bit vector supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getK +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define <4 x i32> @getK() { + %res = load <4 x i32>, <4 x i32>* @K, align 4 + ret <4 x i32> %res +} + +; CHECK-LABEL: _setK +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setK(<4 x i32> %t) { + store <4 x i32> %t, <4 x i32>* @K, align 4 + ret void +} + +@L = common global <1 x i8> , align 4 + +; LDR 8-bit vector supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getL +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr b0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define <1 x i8> @getL() { + %res = load <1 x i8>, <1 x i8>* @L, align 4 + ret <1 x i8> %res +} + +; CHECK-LABEL: _setL +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; Ultimately we should generate str b0, but right now, we match the vector +; variant which does not allow to fold the immediate into the store. +; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define void @setL(<1 x i8> %t) { + store <1 x i8> %t, <1 x i8>* @L, align 4 + ret void +} + +; Make sure we do not assert when we do not track +; all the aliases of a tuple register. +; Indeed the tuple register can be tracked because of +; one of its element, but the other elements of the tuple +; do not need to be tracked and we used to assert on that. +; Note: The test case is fragile in the sense that we need +; a tuple register to appear in the lowering. Thus, the target +; cpu is required to have the problem reproduced. +; CHECK-LABEL: _uninterestingSub +; CHECK: adrp [[ADRP_REG:x[0-9]+]], [[CONSTPOOL:lCPI[0-9]+_[0-9]+]]@PAGE +; CHECK-NEXT: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] +; The tuple comes from the next instruction. 
+; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]] +; CHECK: ret +define void @uninterestingSub(i8* nocapture %row) #0 { + %tmp = bitcast i8* %row to <16 x i8>* + %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16 + %vext43 = shufflevector <16 x i8> , <16 x i8> %tmp1, <16 x i32> + %add.i.414 = add <16 x i8> zeroinitializer, %vext43 + store <16 x i8> %add.i.414, <16 x i8>* %tmp, align 16 + %add.ptr51 = getelementptr inbounds i8, i8* %row, i64 16 + %tmp2 = bitcast i8* %add.ptr51 to <16 x i8>* + %tmp3 = load <16 x i8>, <16 x i8>* %tmp2, align 16 + %tmp4 = bitcast i8* undef to <16 x i8>* + %tmp5 = load <16 x i8>, <16 x i8>* %tmp4, align 16 + %vext157 = shufflevector <16 x i8> %tmp3, <16 x i8> %tmp5, <16 x i32> + %add.i.402 = add <16 x i8> zeroinitializer, %vext157 + store <16 x i8> %add.i.402, <16 x i8>* %tmp4, align 16 + ret void +} + +attributes #0 = { "target-cpu"="cyclone" } diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll index 0ef7b143df807..55c9c6036ed57 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll @@ -94,9 +94,7 @@ entry: store i32 %c, i32* %c.addr, align 4 store i64 %d, i64* %d.addr, align 8 %0 = load i16, i16* %b.addr, align 2 -; CHECK: and w0, w0, #0x1 -; CHECK: cmp w0, #0 -; CHECK: b.eq LBB4_2 +; CHECK: tbz w0, #0, LBB4_2 %conv = trunc i16 %0 to i1 br i1 %conv, label %if.then, label %if.end @@ -106,9 +104,7 @@ if.then: ; preds = %entry if.end: ; preds = %if.then, %entry %1 = load i32, i32* %c.addr, align 4 -; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1 -; CHECK: cmp w[[REG]], #0 -; CHECK: b.eq LBB4_4 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_4 %conv1 = trunc i32 %1 to i1 br i1 %conv1, label %if.then3, label %if.end4 @@ -118,8 +114,7 @@ if.then3: ; preds = %if.end if.end4: ; preds = %if.then3, %if.end %2 = load i64, i64* %d.addr, align 8 -; CHECK: cmp w{{[0-9]+}}, #0 -; CHECK: b.eq LBB4_6 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_6 
%conv5 = trunc i64 %2 to i1 br i1 %conv5, label %if.then7, label %if.end8 @@ -139,9 +134,7 @@ define i32 @trunc64(i64 %foo) nounwind { ; CHECK: trunc64 ; CHECK: and [[REG1:x[0-9]+]], x0, #0x1 ; CHECK: mov x[[REG2:[0-9]+]], [[REG1]] -; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0x1 -; CHECK: cmp [[REG3]], #0 -; CHECK: b.eq LBB5_2 +; CHECK: tbz w[[REG2]], #0, LBB5_2 %a = and i64 %foo, 1 %b = trunc i64 %a to i1 br i1 %b, label %if.then, label %if.else diff --git a/test/CodeGen/AArch64/arm64-fmax-safe.ll b/test/CodeGen/AArch64/arm64-fmax-safe.ll new file mode 100644 index 0000000000000..8b7d66986e786 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-fmax-safe.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s + +define double @test_direct(float %in) { +; CHECK-LABEL: test_direct: + %cmp = fcmp olt float %in, 0.000000e+00 + %val = select i1 %cmp, float 0.000000e+00, float %in + %longer = fpext float %val to double + ret double %longer + +; CHECK: fmax s +} + +define double @test_cross(float %in) { +; CHECK-LABEL: test_cross: + %cmp = fcmp ult float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer + +; CHECK: fmin s +} + +; Same as previous, but with ordered comparison; +; must become fminnm, not fmin. +define double @test_cross_fail_nan(float %in) { +; CHECK-LABEL: test_cross_fail_nan: + %cmp = fcmp olt float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer + +; CHECK: fminnm s +} + +; This isn't a min or a max, but passes the first condition for swapping the +; results. Make sure they're put back before we resort to the normal fcsel. 
+define float @test_cross_fail(float %lhs, float %rhs) { +; CHECK-LABEL: test_cross_fail: + %tst = fcmp une float %lhs, %rhs + %res = select i1 %tst, float %rhs, float %lhs + ret float %res + + ; The register allocator would have to decide to be deliberately obtuse before + ; other register were used. +; CHECK: fcsel s0, s1, s0, ne +} + +; Make sure the transformation isn't triggered for integers +define i64 @test_integer(i64 %in) { + %cmp = icmp slt i64 %in, 0 + %val = select i1 %cmp, i64 0, i64 %in + ret i64 %val +} diff --git a/test/CodeGen/AArch64/arm64-fmax.ll b/test/CodeGen/AArch64/arm64-fmax.ll index ea281528b84ce..40cc36ea52fa1 100644 --- a/test/CodeGen/AArch64/arm64-fmax.ll +++ b/test/CodeGen/AArch64/arm64-fmax.ll @@ -1,57 +1,48 @@ ; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s -; RUN: llc -march=arm64 < %s | FileCheck %s --check-prefix=CHECK-SAFE define double @test_direct(float %in) { ; CHECK-LABEL: test_direct: -; CHECK-SAFE-LABEL: test_direct: - %cmp = fcmp olt float %in, 0.000000e+00 - %longer = fpext float %in to double - %val = select i1 %cmp, double 0.000000e+00, double %longer - ret double %val + %cmp = fcmp nnan olt float %in, 0.000000e+00 + %val = select i1 %cmp, float 0.000000e+00, float %in + %longer = fpext float %val to double + ret double %longer ; CHECK: fmax -; CHECK-SAFE: fmax } define double @test_cross(float %in) { ; CHECK-LABEL: test_cross: -; CHECK-SAFE-LABEL: test_cross: - %cmp = fcmp ult float %in, 0.000000e+00 - %longer = fpext float %in to double - %val = select i1 %cmp, double %longer, double 0.000000e+00 - ret double %val + %cmp = fcmp nnan ult float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer ; CHECK: fmin -; CHECK-SAFE: fmin } ; Same as previous, but with ordered comparison; ; can't be converted in safe-math mode. 
define double @test_cross_fail_nan(float %in) { ; CHECK-LABEL: test_cross_fail_nan: -; CHECK-SAFE-LABEL: test_cross_fail_nan: - %cmp = fcmp olt float %in, 0.000000e+00 - %longer = fpext float %in to double - %val = select i1 %cmp, double %longer, double 0.000000e+00 - ret double %val + %cmp = fcmp nnan olt float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer ; CHECK: fmin -; CHECK-SAFE: fcsel d0, d1, d0, mi } ; This isn't a min or a max, but passes the first condition for swapping the ; results. Make sure they're put back before we resort to the normal fcsel. define float @test_cross_fail(float %lhs, float %rhs) { ; CHECK-LABEL: test_cross_fail: -; CHECK-SAFE-LABEL: test_cross_fail: - %tst = fcmp une float %lhs, %rhs + %tst = fcmp nnan une float %lhs, %rhs %res = select i1 %tst, float %rhs, float %lhs ret float %res ; The register allocator would have to decide to be deliberately obtuse before ; other register were used. ; CHECK: fcsel s0, s1, s0, ne -; CHECK-SAFE: fcsel s0, s1, s0, ne } ; Make sure the transformation isn't triggered for integers @@ -60,3 +51,14 @@ define i64 @test_integer(i64 %in) { %val = select i1 %cmp, i64 0, i64 %in ret i64 %val } + +define float @test_f16(half %in) { +; CHECK-LABEL: test_f16: + %cmp = fcmp nnan ult half %in, 0.000000e+00 + %val = select i1 %cmp, half %in, half 0.000000e+00 + %longer = fpext half %val to float + ret float %longer +; FIXME: It'd be nice for this to create an fmin instruction! 
+; CHECK: fcvt +; CHECK: fcsel +} diff --git a/test/CodeGen/AArch64/arm64-fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll index aaef39fcf512e..097fe2ca6ed9a 100644 --- a/test/CodeGen/AArch64/arm64-fp128.ll +++ b/test/CodeGen/AArch64/arm64-fp128.ll @@ -148,14 +148,9 @@ define i1 @test_setcc2() { ; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs] %val = fcmp ugt fp128 %lhs, %rhs -; CHECK: bl __gttf2 +; CHECK: bl __letf2 ; CHECK: cmp w0, #0 -; CHECK: cset [[GT:w[0-9]+]], gt - -; CHECK: bl __unordtf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[UNORDERED:w[0-9]+]], ne -; CHECK: orr w0, [[UNORDERED]], [[GT]] +; CHECK: cset w0, gt ret i1 %val ; CHECK: ret @@ -169,31 +164,21 @@ define i32 @test_br_cc() { ; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs] ; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs] - ; olt == !uge, which LLVM unfortunately "optimizes" this to. + ; olt == !uge, which LLVM optimizes this to. %cond = fcmp olt fp128 %lhs, %rhs -; CHECK: bl __getf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[OGE:w[0-9]+]], ge - -; CHECK: bl __unordtf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[UNORDERED:w[0-9]+]], ne - -; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]] -; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]] +; CHECK: bl __lttf2 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: b.ge {{.LBB[0-9]+_[0-9]+}} br i1 %cond, label %iftrue, label %iffalse iftrue: ret i32 42 ; CHECK-NEXT: BB# ; CHECK-NEXT: movz w0, #0x2a -; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]] - +; CHECK: ret iffalse: ret i32 29 -; CHECK: [[RET29]]: -; CHECK-NEXT: movz w0, #0x1d -; CHECK-NEXT: [[REALRET]]: +; CHECK: movz w0, #0x1d ; CHECK: ret } diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll index f1c4e9bbaed95..895bfe4b3915a 100644 --- a/test/CodeGen/AArch64/arm64-hello.ll +++ b/test/CodeGen/AArch64/arm64-hello.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc < %s 
-mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX ; CHECK-LABEL: main: ; CHECK: stp x29, x30, [sp, #-16]! diff --git a/test/CodeGen/AArch64/arm64-indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll index b52cddf600ac4..b6ab9934dbc3a 100644 --- a/test/CodeGen/AArch64/arm64-indexed-memory.ll +++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -81,6 +81,17 @@ define void @truncst64to8(i8** nocapture %out, i8 %index, i64 %spacing) nounwind } +define void @storef16(half** %out, half %index, half %spacing) nounwind { +; CHECK-LABEL: storef16: +; CHECK: str h{{[0-9+]}}, [x{{[0-9+]}}], #2 +; CHECK: ret + %tmp = load half*, half** %out, align 2 + %incdec.ptr = getelementptr inbounds half, half* %tmp, i64 1 + store half %spacing, half* %tmp, align 2 + store half* %incdec.ptr, half** %out, align 2 + ret void +} + define void @storef32(float** nocapture %out, float %index, float %spacing) nounwind noinline ssp { ; CHECK-LABEL: storef32: ; CHECK: str s{{[0-9+]}}, [x{{[0-9+]}}], #4 @@ -125,6 +136,17 @@ define float * @pref32(float** nocapture %out, float %spacing) nounwind noinline ret float *%ptr } +define half* @pref16(half** %out, half %spacing) nounwind { +; CHECK-LABEL: pref16: +; CHECK: ldr x0, [x0] +; CHECK-NEXT: str h0, [x0, #6]! +; CHECK-NEXT: ret + %tmp = load half*, half** %out, align 2 + %ptr = getelementptr inbounds half, half* %tmp, i64 3 + store half %spacing, half* %ptr, align 2 + ret half *%ptr +} + define i64 * @pre64(i64** nocapture %out, i64 %spacing) nounwind noinline ssp { ; CHECK-LABEL: pre64: ; CHECK: ldr x0, [x0] @@ -230,6 +252,17 @@ define float* @preidxf32(float* %src, float* %out) { ret float* %ptr } +define half* @preidxf16(half* %src, half* %out) { +; CHECK-LABEL: preidxf16: +; CHECK: ldr h0, [x0, #2]! 
+; CHECK: str h0, [x1] +; CHECK: ret + %ptr = getelementptr inbounds half, half* %src, i64 1 + %tmp = load half, half* %ptr, align 2 + store half %tmp, half* %out, align 2 + ret half* %ptr +} + define i64* @preidx64(i64* %src, i64* %out) { ; CHECK-LABEL: preidx64: ; CHECK: ldr x[[REG:[0-9]+]], [x0, #8]! diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index ba31513172d56..98d4e3646f56c 100644 --- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-ios7.0 -disable-post-ra -o - %s | FileCheck %s @ptr = global i8* null @@ -6215,3 +6215,27 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(i16* %bar, i16** %pt } declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) + +; CHECK-LABEL: test_ld1lane_build: +; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[0], [x0] +; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[1], [x1] +; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[0], [x2] +; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[1], [x3] +; CHECK: sub.2s v[[REGNUM2:[0-9]+]], [[REG0]], [[REG1]] +; CHECK-NEXT: str d[[REGNUM2]], [x4] +; CHECK-NEXT: ret +define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, <2 x i32>* %out) { + %load0 = load i32, i32* %ptr0, align 4 + %load1 = load i32, i32* %ptr1, align 4 + %vec0_0 = insertelement <2 x i32> undef, i32 %load0, i32 0 + %vec0_1 = insertelement <2 x i32> %vec0_0, i32 %load1, i32 1 + + %load2 = load i32, i32* %ptr2, align 4 + %load3 = load i32, i32* %ptr3, align 4 + %vec1_0 = insertelement <2 x i32> undef, i32 %load2, i32 0 + %vec1_1 = insertelement <2 x i32> %vec1_0, i32 %load3, i32 1 + + %sub = sub nsw <2 x i32> %vec0_1, %vec1_1 + store <2 x i32> %sub, <2 x i32>* %out, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index 
802d95826ce4a..ac6e8a7731c69 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as -disable-post-ra | FileCheck %s ; rdar://9167275 diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll index dee0344835419..c65cf95be2e57 100644 --- a/test/CodeGen/AArch64/arm64-join-reserved.ll +++ b/test/CodeGen/AArch64/arm64-join-reserved.ll @@ -5,7 +5,7 @@ target triple = "arm64-apple-macosx10" ; A move isn't necessary. ; ; CHECK-LABEL: g: -; CHECK: str xzr, [sp] +; CHECK: str xzr, [sp, #-16]! ; CHECK: bl ; CHECK: ret define void @g() nounwind ssp { diff --git a/test/CodeGen/AArch64/arm64-large-frame.ll b/test/CodeGen/AArch64/arm64-large-frame.ll index c4cce36bcb74b..d1244e73b0f33 100644 --- a/test/CodeGen/AArch64/arm64-large-frame.ll +++ b/test/CodeGen/AArch64/arm64-large-frame.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim -disable-post-ra < %s | FileCheck %s declare void @use_addr(i8*) @addr = global i8* null diff --git a/test/CodeGen/AArch64/arm64-ld-from-st.ll b/test/CodeGen/AArch64/arm64-ld-from-st.ll new file mode 100644 index 0000000000000..dd8add70cdb7c --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ld-from-st.ll @@ -0,0 +1,666 @@ +; RUN: llc < %s -mtriple aarch64--none-eabi -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: Str64Ldr64 +; CHECK: mov x0, x1 +define i64 @Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i64* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 1 + %1 = load i64, i64* %arrayidx1 + ret i64 %1 
+} + +; CHECK-LABEL: Str64Ldr32_0 +; CHECK: and x0, x1, #0xffffffff +define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 2 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str64Ldr32_1 +; CHECK: lsr x0, x1, #32 +define i32 @Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 3 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str64Ldr16_0 +; CHECK: and x0, x1, #0xffff +define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 4 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_1 +; CHECK: ubfx x0, x1, #16, #16 +define i16 @Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 5 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_2 +; CHECK: ubfx x0, x1, #32, #16 +define i16 @Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 6 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_3 +; CHECK: lsr x0, x1, #48 +define i16 @Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr 
inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 7 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr8_0 +; CHECK: and x0, x1, #0xff +define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 8 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_1 +; CHECK: ubfx x0, x1, #8, #8 +define i8 @Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 9 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_2 +; CHECK: ubfx x0, x1, #16, #8 +define i8 @Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 10 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_3 +; CHECK: ubfx x0, x1, #24, #8 +define i8 @Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 11 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_4 +; CHECK: ubfx x0, x1, #32, #8 +define i8 @Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 12 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_5 +; CHECK: ubfx x0, x1, #40, #8 +define i8 
@Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 13 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_6 +; CHECK: ubfx x0, x1, #48, #8 +define i8 @Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 14 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_7 +; CHECK: lsr x0, x1, #56 +define i8 @Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 15 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr32 +; CHECK: mov w0, w1 +define i32 @Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i32* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str32Ldr16_0 +; CHECK: and w0, w1, #0xffff +define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str32Ldr16_1 +; CHECK: lsr w0, w1, #16 +define i16 @Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3 + 
%1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str32Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 5 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_2 +; CHECK: ubfx w0, w1, #16, #8 +define i8 @Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 6 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_3 +; CHECK: lsr w0, w1, #24 +define i8 @Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 7 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str16Ldr16 +; CHECK: and w0, w1, #0xffff +define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i16* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str16Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds 
i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str16Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 3 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + + +; CHECK-LABEL: Unscaled_Str64Ldr64 +; CHECK: mov x0, x1 +define i64 @Unscaled_Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i64* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 -1 + %1 = load i64, i64* %arrayidx1 + ret i64 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr32_0 +; CHECK: and x0, x1, #0xffffffff +define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -2 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr32_1 +; CHECK: lsr x0, x1, #32 +define i32 @Unscaled_Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_0 +; CHECK: and x0, x1, #0xffff +define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -4 + %1 = load i16, i16* %arrayidx1 + 
ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_1 +; CHECK: ubfx x0, x1, #16, #16 +define i16 @Unscaled_Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_2 +; CHECK: ubfx x0, x1, #32, #16 +define i16 @Unscaled_Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_3 +; CHECK: lsr x0, x1, #48 +define i16 @Unscaled_Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_0 +; CHECK: and x0, x1, #0xff +define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -8 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_1 +; CHECK: ubfx x0, x1, #8, #8 +define i8 @Unscaled_Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -7 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_2 +; CHECK: ubfx x0, x1, #16, #8 +define i8 
@Unscaled_Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -6 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_3 +; CHECK: ubfx x0, x1, #24, #8 +define i8 @Unscaled_Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -5 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_4 +; CHECK: ubfx x0, x1, #32, #8 +define i8 @Unscaled_Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_5 +; CHECK: ubfx x0, x1, #40, #8 +define i8 @Unscaled_Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_6 +; CHECK: ubfx x0, x1, #48, #8 +define i8 @Unscaled_Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_7 +; CHECK: lsr x0, x1, #56 +define i8 @Unscaled_Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* 
%P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr32 +; CHECK: mov w0, w1 +define i32 @Unscaled_Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i32* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr16_0 +; CHECK: and w0, w1, #0xffff +define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr16_1 +; CHECK: lsr w0, w1, #16 +define i16 @Unscaled_Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Unscaled_Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3 + %1 = load i8, i8* %arrayidx1 + ret 
i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_2 +; CHECK: ubfx w0, w1, #16, #8 +define i8 @Unscaled_Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_3 +; CHECK: lsr w0, w1, #24 +define i8 @Unscaled_Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr16 +; CHECK: and w0, w1, #0xffff +define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i16* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Unscaled_Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: StrVolatileLdr +; CHECK: ldrh +define i16 @StrVolatileLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast 
i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load volatile i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: StrNotInRangeLdr +; CHECK: ldrh +define i16 @StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_StrNotInRangeLdr +; CHECK: ldurh +define i16 @Unscaled_StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: StrCallLdr +; CHECK: ldrh +define i16 @StrCallLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %c = call i1 @test_dummy() + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +declare i1 @test_dummy() + +; CHECK-LABEL: StrStrLdr +; CHECK: ldrh +define i16 @StrStrLdr(i32 %v, i32* %P, i32* %P2, i32 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + store i32 %n, i32* %P2 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} diff --git a/test/CodeGen/AArch64/arm64-ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll index a192eab112faf..6071d092f8b37 100644 --- a/test/CodeGen/AArch64/arm64-ldp.ll +++ b/test/CodeGen/AArch64/arm64-ldp.ll @@ -1,8 +1,6 @@ ; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s 
-; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\ -; RUN: -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s -; CHECK: ldp_int +; CHECK-LABEL: ldp_int ; CHECK: ldp define i32 @ldp_int(i32* %p) nounwind { %tmp = load i32, i32* %p, align 4 @@ -12,7 +10,7 @@ define i32 @ldp_int(i32* %p) nounwind { ret i32 %add } -; CHECK: ldp_sext_int +; CHECK-LABEL: ldp_sext_int ; CHECK: ldpsw define i64 @ldp_sext_int(i32* %p) nounwind { %tmp = load i32, i32* %p, align 4 @@ -51,7 +49,7 @@ define i64 @ldp_half_sext_res1_int(i32* %p) nounwind { } -; CHECK: ldp_long +; CHECK-LABEL: ldp_long ; CHECK: ldp define i64 @ldp_long(i64* %p) nounwind { %tmp = load i64, i64* %p, align 8 @@ -61,7 +59,7 @@ define i64 @ldp_long(i64* %p) nounwind { ret i64 %add } -; CHECK: ldp_float +; CHECK-LABEL: ldp_float ; CHECK: ldp define float @ldp_float(float* %p) nounwind { %tmp = load float, float* %p, align 4 @@ -71,7 +69,7 @@ define float @ldp_float(float* %p) nounwind { ret float %add } -; CHECK: ldp_double +; CHECK-LABEL: ldp_double ; CHECK: ldp define double @ldp_double(double* %p) nounwind { %tmp = load double, double* %p, align 8 @@ -83,10 +81,10 @@ define double @ldp_double(double* %p) nounwind { ; Test the load/store optimizer---combine ldurs into a ldp, if appropriate define i32 @ldur_int(i32* %a) nounwind { -; LDUR_CHK: ldur_int -; LDUR_CHK: ldp [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8] -; LDUR_CHK-NEXT: add w{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_int +; CHECK: ldp [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8] +; CHECK-NEXT: add w{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -96,10 +94,10 @@ define i32 @ldur_int(i32* %a) nounwind { } define i64 @ldur_sext_int(i32* %a) nounwind { -; LDUR_CHK: ldur_sext_int -; LDUR_CHK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-8] -; LDUR_CHK-NEXT: add 
x{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_sext_int +; CHECK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-8] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -111,11 +109,11 @@ define i64 @ldur_sext_int(i32* %a) nounwind { } define i64 @ldur_half_sext_int_res0(i32* %a) nounwind { -; LDUR_CHK: ldur_half_sext_int_res0 -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] -; LDUR_CHK: sxtw x[[DST1]], w[[DST1]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_half_sext_int_res0 +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] +; CHECK: sxtw x[[DST1]], w[[DST1]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -127,11 +125,11 @@ define i64 @ldur_half_sext_int_res0(i32* %a) nounwind { } define i64 @ldur_half_sext_int_res1(i32* %a) nounwind { -; LDUR_CHK: ldur_half_sext_int_res1 -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] -; LDUR_CHK: sxtw x[[DST2]], w[[DST2]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_half_sext_int_res1 +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] +; CHECK: sxtw x[[DST2]], w[[DST2]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -144,10 +142,10 @@ define i64 @ldur_half_sext_int_res1(i32* %a) nounwind { define i64 @ldur_long(i64* %a) nounwind ssp { -; LDUR_CHK: ldur_long -; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], 
[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_long +; CHECK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -1 %tmp1 = load i64, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %a, i64 -2 @@ -157,10 +155,10 @@ define i64 @ldur_long(i64* %a) nounwind ssp { } define float @ldur_float(float* %a) { -; LDUR_CHK: ldur_float -; LDUR_CHK: ldp [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8] -; LDUR_CHK-NEXT: add s{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_float +; CHECK: ldp [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8] +; CHECK-NEXT: add s{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds float, float* %a, i64 -1 %tmp1 = load float, float* %p1, align 2 %p2 = getelementptr inbounds float, float* %a, i64 -2 @@ -170,10 +168,10 @@ define float @ldur_float(float* %a) { } define double @ldur_double(double* %a) { -; LDUR_CHK: ldur_double -; LDUR_CHK: ldp [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16] -; LDUR_CHK-NEXT: add d{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_double +; CHECK: ldp [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16] +; CHECK-NEXT: add d{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds double, double* %a, i64 -1 %tmp1 = load double, double* %p1, align 2 %p2 = getelementptr inbounds double, double* %a, i64 -2 @@ -184,11 +182,11 @@ define double @ldur_double(double* %a) { ; Now check some boundary conditions define i64 @pairUpBarelyIn(i64* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyIn -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyIn +; CHECK-NOT: ldur +; CHECK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; 
CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -31 %tmp1 = load i64, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %a, i64 -32 @@ -198,11 +196,11 @@ define i64 @pairUpBarelyIn(i64* %a) nounwind ssp { } define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyInSext -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyInSext +; CHECK-NOT: ldur +; CHECK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -63 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -64 @@ -214,12 +212,12 @@ define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp { } define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyInHalfSextRes0 -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] -; LDUR_CHK: sxtw x[[DST1]], w[[DST1]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyInHalfSextRes0 +; CHECK-NOT: ldur +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] +; CHECK: sxtw x[[DST1]], w[[DST1]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -63 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -64 @@ -231,12 +229,12 @@ define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp { } define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyInHalfSextRes1 -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] -; LDUR_CHK: sxtw x[[DST2]], w[[DST2]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyInHalfSextRes1 
+; CHECK-NOT: ldur +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] +; CHECK: sxtw x[[DST2]], w[[DST2]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -63 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -64 @@ -248,12 +246,12 @@ define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp { } define i64 @pairUpBarelyOut(i64* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyOut -; LDUR_CHK-NOT: ldp +; CHECK-LABEL: pairUpBarelyOut +; CHECK-NOT: ldp ; Don't be fragile about which loads or manipulations of the base register ; are used---just check that there isn't an ldp before the add -; LDUR_CHK: add -; LDUR_CHK-NEXT: ret +; CHECK: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -32 %tmp1 = load i64, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %a, i64 -33 @@ -263,12 +261,12 @@ define i64 @pairUpBarelyOut(i64* %a) nounwind ssp { } define i64 @pairUpBarelyOutSext(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyOutSext -; LDUR_CHK-NOT: ldp +; CHECK-LABEL: pairUpBarelyOutSext +; CHECK-NOT: ldp ; Don't be fragile about which loads or manipulations of the base register ; are used---just check that there isn't an ldp before the add -; LDUR_CHK: add -; LDUR_CHK-NEXT: ret +; CHECK: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -64 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -65 @@ -280,12 +278,12 @@ define i64 @pairUpBarelyOutSext(i32* %a) nounwind ssp { } define i64 @pairUpNotAligned(i64* %a) nounwind ssp { -; LDUR_CHK: pairUpNotAligned -; LDUR_CHK-NOT: ldp -; LDUR_CHK: ldur -; LDUR_CHK-NEXT: ldur -; LDUR_CHK-NEXT: add -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpNotAligned +; CHECK-NOT: ldp +; CHECK: ldur +; CHECK-NEXT: ldur +; CHECK-NEXT: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -18 %bp1 = bitcast i64* %p1 to i8* %bp1p1 = getelementptr 
inbounds i8, i8* %bp1, i64 1 @@ -303,12 +301,12 @@ define i64 @pairUpNotAligned(i64* %a) nounwind ssp { } define i64 @pairUpNotAlignedSext(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpNotAlignedSext -; LDUR_CHK-NOT: ldp -; LDUR_CHK: ldursw -; LDUR_CHK-NEXT: ldursw -; LDUR_CHK-NEXT: add -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpNotAlignedSext +; CHECK-NOT: ldp +; CHECK: ldursw +; CHECK-NEXT: ldursw +; CHECK-NEXT: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -18 %bp1 = bitcast i32* %p1 to i8* %bp1p1 = getelementptr inbounds i8, i8* %bp1, i64 1 @@ -326,3 +324,35 @@ define i64 @pairUpNotAlignedSext(i32* %a) nounwind ssp { %tmp3 = add i64 %sexttmp1, %sexttmp2 ret i64 %tmp3 } + +declare void @use-ptr(i32*) + +; CHECK-LABEL: ldp_sext_int_pre +; CHECK: ldpsw x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}, #8] +define i64 @ldp_sext_int_pre(i32* %p) nounwind { + %ptr = getelementptr inbounds i32, i32* %p, i64 2 + call void @use-ptr(i32* %ptr) + %add.ptr = getelementptr inbounds i32, i32* %ptr, i64 0 + %tmp = load i32, i32* %add.ptr, align 4 + %add.ptr1 = getelementptr inbounds i32, i32* %ptr, i64 1 + %tmp1 = load i32, i32* %add.ptr1, align 4 + %sexttmp = sext i32 %tmp to i64 + %sexttmp1 = sext i32 %tmp1 to i64 + %add = add nsw i64 %sexttmp1, %sexttmp + ret i64 %add +} + +; CHECK-LABEL: ldp_sext_int_post +; CHECK: ldpsw x{{[0-9]+}}, x{{[0-9]+}}, [x0], #8 +define i64 @ldp_sext_int_post(i32* %p) nounwind { + %tmp = load i32, i32* %p, align 4 + %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 + %tmp1 = load i32, i32* %add.ptr, align 4 + %sexttmp = sext i32 %tmp to i64 + %sexttmp1 = sext i32 %tmp1 to i64 + %ptr = getelementptr inbounds i32, i32* %add.ptr, i64 1 + call void @use-ptr(i32* %ptr) + %add = add nsw i64 %sexttmp1, %sexttmp + ret i64 %add +} + diff --git a/test/CodeGen/AArch64/arm64-long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll index d5baf16bdd5ce..ad89d3ff711bc 100644 --- a/test/CodeGen/AArch64/arm64-long-shift.ll +++ 
b/test/CodeGen/AArch64/arm64-long-shift.ll @@ -2,18 +2,20 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: shl: -; CHECK: lsl [[XREG_0:x[0-9]+]], x1, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsr [[XREG_3:x[0-9]+]], x0, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]] -; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsl [[XREG_5:x[0-9]+]], x0, [[XREG_4]] -; CHECK-NEXT: cmp [[XREG_4]], #0 -; CHECK-NEXT: csel x1, [[XREG_5]], [[XREG_6]], ge -; CHECK-NEXT: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 -; CHECK-NEXT: csel x0, xzr, [[SMALLSHIFT_LO]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsr [[LO_FOR_HI_NORMAL:x[0-9]+]], x0, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[LO_FOR_HI:x[0-9]+]], xzr, [[LO_FOR_HI_NORMAL]], eq +; CHECK: lsl [[HI_FOR_HI:x[0-9]+]], x1, x2 +; CHECK: orr [[HI_NORMAL:x[0-9]+]], [[LO_FOR_HI]], [[HI_FOR_HI]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsl [[HI_BIG_SHIFT:x[0-9]+]], x0, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x1, [[HI_BIG_SHIFT]], [[HI_NORMAL]], ge +; CHECK: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 +; CHECK: csel x0, xzr, [[SMALLSHIFT_LO]], ge +; CHECK: ret %shl = shl i128 %r, %s ret i128 %shl @@ -21,19 +23,21 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { define i128 @ashr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: ashr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: asr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], 
[[XREG_4]], ge -; CHECK-NEXT: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 -; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: asr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 +; CHECK: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = ashr i128 %r, %s ret i128 %shr @@ -41,18 +45,20 @@ define i128 @ashr(i128 %r, i128 %s) nounwind readnone { define i128 @lshr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: lshr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge -; CHECK-NEXT: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; 
CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: csel x1, xzr, [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = lshr i128 %r, %s ret i128 %shr diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll index 5bc4d71501ba4..85572f2cf0f8c 100644 --- a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll +++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios -aarch64-strict-align < %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s ; Small (16-bytes here) unaligned memcpys should stay memcpy calls if ; strict-alignment is turned on. diff --git a/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll new file mode 100644 index 0000000000000..5276ac334a71e --- /dev/null +++ b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll @@ -0,0 +1,406 @@ +; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE +; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE + +; CHECK-LABEL: Ldrh_merge +; CHECK-NOT: ldrh +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i16 @Ldrh_merge(i16* nocapture readonly %p) { + %1 = load i16, i16* %p, align 2 + %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1 + %2 = load i16, i16* %arrayidx2, align 2 + %add = 
sub nuw nsw i16 %1, %2 + ret i16 %add +} + +; CHECK-LABEL: Ldurh_merge +; CHECK-NOT: ldurh +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i16 @Ldurh_merge(i16* nocapture readonly %p) { +entry: + %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2 + %0 = load i16, i16* %arrayidx + %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1 + %1 = load i16, i16* %arrayidx3 + %add = sub nuw nsw i16 %0, %1 + ret i16 %add +} + +; CHECK-LABEL: Ldrh_4_merge +; CHECK-NOT: ldrh +; CHECK: ldp [[WORD1:w[0-9]+]], [[WORD2:w[0-9]+]], [x0] +; CHECK-DAG: and [[WORD1LO:w[0-9]+]], [[WORD1]], #0xffff +; CHECK-DAG: lsr [[WORD1HI:w[0-9]+]], [[WORD1]], #16 +; CHECK-DAG: and [[WORD2LO:w[0-9]+]], [[WORD2]], #0xffff +; CHECK-DAG: lsr [[WORD2HI:w[0-9]+]], [[WORD2]], #16 +; LE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1HI]], [[WORD1LO]] +; BE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1LO]], [[WORD1HI]] +; LE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2LO]] +; BE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2HI]] +; LE: sub w0, [[TEMP2]], [[WORD2HI]] +; BE: sub w0, [[TEMP2]], [[WORD2LO]] +define i16 @Ldrh_4_merge(i16* nocapture readonly %P) { + %arrayidx = getelementptr inbounds i16, i16* %P, i64 0 + %l0 = load i16, i16* %arrayidx + %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1 + %l1 = load i16, i16* %arrayidx2 + %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2 + %l2 = load i16, i16* %arrayidx7 + %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3 + %l3 = load i16, i16* %arrayidx12 + %add4 = sub nuw nsw i16 %l1, %l0 + %add9 = udiv i16 %add4, %l2 + %add14 = sub nuw nsw i16 %add9, %l3 + ret i16 %add14 +} + +; CHECK-LABEL: Ldrsh_merge +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub 
{{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] + +define i32 @Ldrsh_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 5 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = sext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp1, %sexttmp + ret i32 %add +} + +; CHECK-LABEL: Ldrsh_zsext_merge +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; LE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsh_zsext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 5 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = zext i16 %tmp to i32 + %sexttmp1 = sext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsh_szext_merge +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; LE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; LE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; BE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsh_szext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 5 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = zext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrb_merge +; CHECK: ldrh 
[[NEW_DEST:w[0-9]+]] +; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; CHECK-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsb_merge +; CHECK: ldrh [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; CHECK-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + %sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsb_zsext_merge +; CHECK: ldrh [[NEW_DEST:w[0-9]+]] +; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; LE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsb_zsext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsb_szext_merge +; CHECK: ldrh 
[[NEW_DEST:w[0-9]+]] +; LE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; LE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; BE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsb_szext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursh_merge +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursh_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = sext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursh_zsext_merge +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; LE-DAG: lsr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursh_zsext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = zext i16 %tmp to i32 + %sexttmp1 = sext 
i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursh_szext_merge +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; LE-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; BE-DAG: lsr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursh_szext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = zext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldurb_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; CHECK-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldurb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursb_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; CHECK-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + 
%sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursb_zsext_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; LE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursb_zsext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursb_szext_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; LE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; BE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursb_szext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Strh_zero +; CHECK: str wzr +define void @Strh_zero(i16* nocapture %P, i32 %n) { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %add = add nsw i32 %n, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 %idxprom1 + store i16 0, i16* %arrayidx2 + ret void +} + +; 
CHECK-LABEL: Strh_zero_4 +; CHECK: stp wzr, wzr +define void @Strh_zero_4(i16* nocapture %P, i32 %n) { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %add = add nsw i32 %n, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 %idxprom1 + store i16 0, i16* %arrayidx2 + %add3 = add nsw i32 %n, 2 + %idxprom4 = sext i32 %add3 to i64 + %arrayidx5 = getelementptr inbounds i16, i16* %P, i64 %idxprom4 + store i16 0, i16* %arrayidx5 + %add6 = add nsw i32 %n, 3 + %idxprom7 = sext i32 %add6 to i64 + %arrayidx8 = getelementptr inbounds i16, i16* %P, i64 %idxprom7 + store i16 0, i16* %arrayidx8 + ret void +} + +; CHECK-LABEL: Sturb_zero +; CHECK: sturh wzr +define void @Sturb_zero(i8* nocapture %P, i32 %n) #0 { +entry: + %sub = add nsw i32 %n, -2 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i8, i8* %P, i64 %idxprom + store i8 0, i8* %arrayidx + %sub2= add nsw i32 %n, -1 + %idxprom1 = sext i32 %sub2 to i64 + %arrayidx2 = getelementptr inbounds i8, i8* %P, i64 %idxprom1 + store i8 0, i8* %arrayidx2 + ret void +} + +; CHECK-LABEL: Sturh_zero +; CHECK: stur wzr +define void @Sturh_zero(i16* nocapture %P, i32 %n) { +entry: + %sub = add nsw i32 %n, -2 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %sub1 = add nsw i32 %n, -3 + %idxprom2 = sext i32 %sub1 to i64 + %arrayidx3 = getelementptr inbounds i16, i16* %P, i64 %idxprom2 + store i16 0, i16* %arrayidx3 + ret void +} + +; CHECK-LABEL: Sturh_zero_4 +; CHECK: stp wzr, wzr +define void @Sturh_zero_4(i16* nocapture %P, i32 %n) { +entry: + %sub = add nsw i32 %n, -3 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %sub1 = add nsw i32 %n, -4 + %idxprom2 = sext i32 %sub1 to i64 + %arrayidx3 = getelementptr inbounds i16, i16* 
%P, i64 %idxprom2 + store i16 0, i16* %arrayidx3 + %sub4 = add nsw i32 %n, -2 + %idxprom5 = sext i32 %sub4 to i64 + %arrayidx6 = getelementptr inbounds i16, i16* %P, i64 %idxprom5 + store i16 0, i16* %arrayidx6 + %sub7 = add nsw i32 %n, -1 + %idxprom8 = sext i32 %sub7 to i64 + %arrayidx9 = getelementptr inbounds i16, i16* %P, i64 %idxprom8 + store i16 0, i16* %arrayidx9 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll index 869966caa3ae3..985b5bf483acd 100644 --- a/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -535,6 +535,17 @@ entry: declare double @llvm.fma.f64(double, double, double) +define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) { +; CHECK-LABEL: test_vfmss_lane_f32 +; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret +entry: + %extract.rhs = extractelement <2 x float> %v, i32 1 + %extract = fsub float -0.000000e+00, %extract.rhs + %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) + ret float %0 +} + define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmss_laneq_f32 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] @@ -557,6 +568,50 @@ entry: ret double %0 } +define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) { +; CHCK-LABEL: test_vfmsd_lane_f64_0 +; CHCK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; CHCK-NEXT: ret +entry: + %tmp0 = fsub <1 x double> , %v + %tmp1 = extractelement <1 x double> %tmp0, i32 0 + %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a) + ret double %0 +} + +define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) { +; CHECK-LABEL: test_vfmss_lane_f32_0 +; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret +entry: + %tmp0 = fsub <2 x float> , %v + %tmp1 = extractelement <2 x float> %tmp0, i32 1 + %0 = tail call float 
@llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %0 +} + +define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) { +; CHECK-LABEL: test_vfmss_laneq_f32_0 +; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret +entry: + %tmp0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v + %tmp1 = extractelement <4 x float> %tmp0, i32 3 + %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %0 +} + +define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) { +; CHECK-LABEL: test_vfmsd_laneq_f64_0 +; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret +entry: + %tmp0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v + %tmp1 = extractelement <2 x double> %tmp0, i32 1 + %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a) + ret double %0 +} + define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index b74a40626ceeb..83b1cac70f5c0 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -320,21 +320,20 @@ define i32 @smovw8h(<8 x i16> %tmp1) { ret i32 %tmp5 } -define i32 @smovx16b(<16 x i8> %tmp1) { +define i64 @smovx16b(<16 x i8> %tmp1) { ; CHECK-LABEL: smovx16b: -; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[8] +; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8] %tmp3 = extractelement <16 x i8> %tmp1, i32 8 - %tmp4 = sext i8 %tmp3 to i32 - %tmp5 = add i32 %tmp4, %tmp4 - ret i32 %tmp5 + %tmp4 = sext i8 %tmp3 to i64 + ret i64 %tmp4 } -define i32 @smovx8h(<8 x i16> %tmp1) { +define i64 @smovx8h(<8 x i16> %tmp1) { ; CHECK-LABEL: smovx8h: -; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <8 x i16> %tmp1, i32 2 - %tmp4 = sext i16 %tmp3 to i32 - ret i32 %tmp4 + %tmp4 = sext i16 %tmp3 to
i64 + ret i64 %tmp4 } define i64 @smovx4s(<4 x i32> %tmp1) { diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll index b8236c5b24795..c2006ccdd064b 100644 --- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll +++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll @@ -7,7 +7,7 @@ define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { entry: ; CHECK-LABEL: jscall_patchpoint_codegen: ; CHECK: Ltmp -; CHECK: str x{{.+}}, [sp] +; CHECK: str x{{.+}}, [sp, #-16]! ; CHECK-NEXT: mov x0, x{{.+}} ; CHECK: Ltmp ; CHECK-NEXT: movz x16, #0xffff, lsl #32 @@ -16,7 +16,7 @@ entry: ; CHECK-NEXT: blr x16 ; FAST-LABEL: jscall_patchpoint_codegen: ; FAST: Ltmp -; FAST: str x{{.+}}, [sp] +; FAST: str x{{.+}}, [sp, #-16]! ; FAST: Ltmp ; FAST-NEXT: movz x16, #0xffff, lsl #32 ; FAST-NEXT: movk x16, #0xdead, lsl #16 @@ -50,7 +50,7 @@ entry: ; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2 ; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4 ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 -; FAST-NEXT: str [[REG1]], [sp] +; FAST-NEXT: str [[REG1]], [sp, #-32]! ; FAST-NEXT: str [[REG2]], [sp, #16] ; FAST-NEXT: str [[REG3]], [sp, #24] ; FAST: Ltmp @@ -90,7 +90,7 @@ entry: ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 ; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8 ; FAST-NEXT: movz [[REG5:x[0-9]+]], #0xa -; FAST-NEXT: str [[REG1]], [sp] +; FAST-NEXT: str [[REG1]], [sp, #-64]! 
; FAST-NEXT: str [[REG2]], [sp, #16] ; FAST-NEXT: str [[REG3]], [sp, #24] ; FAST-NEXT: str [[REG4]], [sp, #36] diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll index 60672aa38486b..f3af01a73559f 100644 --- a/test/CodeGen/AArch64/arm64-platform-reg.ll +++ b/test/CodeGen/AArch64/arm64-platform-reg.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 -; RUN: llc -mtriple=arm64-freebsd-gnu -aarch64-reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 +; RUN: llc -mtriple=arm64-apple-ios -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 +; RUN: llc -mtriple=arm64-freebsd-gnu -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s ; x18 is reserved as a platform register on Darwin but not on other diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll index b0b529a13f413..9ee53a0f92e6b 100644 --- a/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/test/CodeGen/AArch64/arm64-popcnt.ll @@ -4,8 +4,8 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %cnt -; CHECK: ubfx x{{[0-9]+}} -; CHECK: fmov d0, x{{[0-9]+}} +; CHECK: mov w[[IN64:[0-9]+]], w0 +; CHECK: fmov d0, x[[IN64]] ; CHECK: cnt.8b v0, v0 ; CHECK: uaddlv.8b h0, v0 ; CHECK: fmov w0, s0 @@ -59,7 +59,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %cnt ; CHECK-LABEL: cnt32: -; CHECK-NOT 16b +; CHECK-NOT: 16b ; CHECK: ret } @@ -67,7 +67,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt ; CHECK-LABEL: cnt64: -; CHECK-NOT 16b +; CHECK-NOT: 16b ; CHECK: ret } diff --git a/test/CodeGen/AArch64/arm64-rounding.ll b/test/CodeGen/AArch64/arm64-rounding.ll index 
931114447adfa..d487aabccc4f1 100644 --- a/test/CodeGen/AArch64/arm64-rounding.ll +++ b/test/CodeGen/AArch64/arm64-rounding.ll @@ -1,10 +1,8 @@ -; RUN: llc -O3 < %s -mcpu=cyclone | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" -target triple = "arm64-apple-ios6.0.0" +; RUN: llc -O3 < %s -mtriple=arm64 | FileCheck %s -; CHECK: test1 -; CHECK: frintx +; CHECK-LABEL: test1: ; CHECK: frintm +; CHECK-NOT: frintx define float @test1(float %a) #0 { entry: %call = tail call float @floorf(float %a) nounwind readnone @@ -13,9 +11,9 @@ entry: declare float @floorf(float) nounwind readnone -; CHECK: test2 -; CHECK: frintx +; CHECK-LABEL: test2: ; CHECK: frintm +; CHECK-NOT: frintx define double @test2(double %a) #0 { entry: %call = tail call double @floor(double %a) nounwind readnone @@ -24,7 +22,7 @@ entry: declare double @floor(double) nounwind readnone -; CHECK: test3 +; CHECK-LABEL: test3: ; CHECK: frinti define float @test3(float %a) #0 { entry: @@ -34,7 +32,7 @@ entry: declare float @nearbyintf(float) nounwind readnone -; CHECK: test4 +; CHECK-LABEL: test4: ; CHECK: frinti define double @test4(double %a) #0 { entry: @@ -44,9 +42,9 @@ entry: declare double @nearbyint(double) nounwind readnone -; CHECK: test5 -; CHECK: frintx +; CHECK-LABEL: test5: ; CHECK: frintp +; CHECK-NOT: frintx define float @test5(float %a) #0 { entry: %call = tail call float @ceilf(float %a) nounwind readnone @@ -55,9 +53,9 @@ entry: declare float @ceilf(float) nounwind readnone -; CHECK: test6 -; CHECK: frintx +; CHECK-LABEL: test6: ; CHECK: frintp +; CHECK-NOT: frintx define double @test6(double %a) #0 { entry: %call = tail call double @ceil(double %a) nounwind readnone @@ -66,7 +64,7 @@ entry: declare double @ceil(double) nounwind readnone -; CHECK: test7 +; CHECK-LABEL: test7: ; CHECK: frintx define float @test7(float %a) #0 { entry: @@ -76,7 +74,7 @@ entry: declare float @rintf(float) 
nounwind readnone -; CHECK: test8 +; CHECK-LABEL: test8: ; CHECK: frintx define double @test8(double %a) #0 { entry: @@ -86,9 +84,9 @@ entry: declare double @rint(double) nounwind readnone -; CHECK: test9 -; CHECK: frintx +; CHECK-LABEL: test9: ; CHECK: frintz +; CHECK-NOT: frintx define float @test9(float %a) #0 { entry: %call = tail call float @truncf(float %a) nounwind readnone @@ -97,9 +95,9 @@ entry: declare float @truncf(float) nounwind readnone -; CHECK: test10 -; CHECK: frintx +; CHECK-LABEL: test10: ; CHECK: frintz +; CHECK-NOT: frintx define double @test10(double %a) #0 { entry: %call = tail call double @trunc(double %a) nounwind readnone @@ -108,9 +106,9 @@ entry: declare double @trunc(double) nounwind readnone -; CHECK: test11 -; CHECK: frintx +; CHECK-LABEL: test11: ; CHECK: frinta +; CHECK-NOT: frintx define float @test11(float %a) #0 { entry: %call = tail call float @roundf(float %a) nounwind readnone @@ -119,9 +117,9 @@ entry: declare float @roundf(float %a) nounwind readnone -; CHECK: test12 -; CHECK: frintx +; CHECK-LABEL: test12: ; CHECK: frinta +; CHECK-NOT: frintx define double @test12(double %a) #0 { entry: %call = tail call double @round(double %a) nounwind readnone @@ -130,7 +128,7 @@ entry: declare double @round(double %a) nounwind readnone -; CHECK: test13 +; CHECK-LABEL: test13: ; CHECK-NOT: frintx ; CHECK: frintm define float @test13(float %a) #1 { @@ -139,7 +137,7 @@ entry: ret float %call } -; CHECK: test14 +; CHECK-LABEL: test14: ; CHECK-NOT: frintx ; CHECK: frintm define double @test14(double %a) #1 { @@ -148,7 +146,7 @@ entry: ret double %call } -; CHECK: test15 +; CHECK-LABEL: test15: ; CHECK-NOT: frintx ; CHECK: frintp define float @test15(float %a) #1 { @@ -157,7 +155,7 @@ entry: ret float %call } -; CHECK: test16 +; CHECK-LABEL: test16: ; CHECK-NOT: frintx ; CHECK: frintp define double @test16(double %a) #1 { @@ -166,7 +164,7 @@ entry: ret double %call } -; CHECK: test17 +; CHECK-LABEL: test17: ; CHECK-NOT: frintx ; CHECK: 
frintz define float @test17(float %a) #1 { @@ -175,7 +173,7 @@ entry: ret float %call } -; CHECK: test18 +; CHECK-LABEL: test18: ; CHECK-NOT: frintx ; CHECK: frintz define double @test18(double %a) #1 { @@ -184,7 +182,7 @@ entry: ret double %call } -; CHECK: test19 +; CHECK-LABEL: test19: ; CHECK-NOT: frintx ; CHECK: frinta define float @test19(float %a) #1 { @@ -193,7 +191,7 @@ entry: ret float %call } -; CHECK: test20 +; CHECK-LABEL: test20: ; CHECK-NOT: frintx ; CHECK: frinta define double @test20(double %a) #1 { @@ -202,7 +200,5 @@ entry: ret double %call } - - attributes #0 = { nounwind } attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index 599712be401c6..2ecd66ddf5d42 100644 --- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -1,5 +1,5 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE -; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE +; RUN: llc %s -o - -enable-shrink-wrap=true -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE +; RUN: llc %s -o - -enable-shrink-wrap=false -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios" @@ -539,3 +539,94 @@ if.end: declare void @abort() #0 attributes #0 = { noreturn nounwind } + +; Make sure that we handle infinite loops properly. When checking that the Save +; and Restore blocks are control flow equivalent, the loop searches for the +; immediate (post) dominator for the (restore) save blocks. When either the Save +; or Restore block is located in an infinite loop the only immediate (post) +; dominator is itself.
In this case, we cannot perform shrink wrapping, but we +; should return gracefully and continue compilation. +; The only condition for this test is the compilation finishes correctly. +; +; CHECK-LABEL: infiniteloop +; CHECK: ret +define void @infiniteloop() { +entry: + br i1 undef, label %if.then, label %if.end + +if.then: + %ptr = alloca i32, i32 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %sum.03 = phi i32 [ 0, %if.then ], [ %add, %for.body ] + %call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)() + %add = add nsw i32 %call, %sum.03 + store i32 %add, i32* %ptr + br label %for.body + +if.end: + ret void +} + +; Another infinite loop test this time with a body bigger than just one block. +; CHECK-LABEL: infiniteloop2 +; CHECK: ret +define void @infiniteloop2() { +entry: + br i1 undef, label %if.then, label %if.end + +if.then: + %ptr = alloca i32, i32 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %sum.03 = phi i32 [ 0, %if.then ], [ %add, %body1 ], [ 1, %body2] + %call = tail call i32 asm "mov $0, #0", "=r,~{x19}"() + %add = add nsw i32 %call, %sum.03 + store i32 %add, i32* %ptr + br i1 undef, label %body1, label %body2 + +body1: + tail call void asm sideeffect "nop", "~{x19}"() + br label %for.body + +body2: + tail call void asm sideeffect "nop", "~{x19}"() + br label %for.body + +if.end: + ret void +} + +; Another infinite loop test this time with two nested infinite loop. 
+; CHECK-LABEL: infiniteloop3 +; CHECK: ret +define void @infiniteloop3() { +entry: + br i1 undef, label %loop2a, label %body + +body: ; preds = %entry + br i1 undef, label %loop2a, label %end + +loop1: ; preds = %loop2a, %loop2b + %var.phi = phi i32* [ %next.phi, %loop2b ], [ %var, %loop2a ] + %next.phi = phi i32* [ %next.load, %loop2b ], [ %next.var, %loop2a ] + %0 = icmp eq i32* %var, null + %next.load = load i32*, i32** undef + br i1 %0, label %loop2a, label %loop2b + +loop2a: ; preds = %loop1, %body, %entry + %var = phi i32* [ null, %body ], [ null, %entry ], [ %next.phi, %loop1 ] + %next.var = phi i32* [ undef, %body ], [ null, %entry ], [ %next.load, %loop1 ] + br label %loop1 + +loop2b: ; preds = %loop1 + %gep1 = bitcast i32* %var.phi to i32* + %next.ptr = bitcast i32* %gep1 to i32** + store i32* %next.phi, i32** %next.ptr + br label %loop1 + +end: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-spill-lr.ll b/test/CodeGen/AArch64/arm64-spill-lr.ll index 88109088a2ff4..2ea5d7810a146 100644 --- a/test/CodeGen/AArch64/arm64-spill-lr.ll +++ b/test/CodeGen/AArch64/arm64-spill-lr.ll @@ -1,9 +1,9 @@ ; RUN: llc -mtriple=arm64-apple-ios < %s @bar = common global i32 0, align 4 -; Leaf function which uses all callee-saved registers and allocates >= 256 bytes on the stack -; this will cause processFunctionBeforeCalleeSavedScan() to spill LR as an additional scratch -; register. +; Leaf function which uses all callee-saved registers and allocates >= 256 bytes +; on the stack this will cause determineCalleeSaves() to spill LR as an +; additional scratch register. ; ; This is a crash-only regression test for rdar://15124582. 
define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) nounwind { diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll index 1a4df7a6f2d68..3eb1d27530012 100644 --- a/test/CodeGen/AArch64/arm64-stackmap.ll +++ b/test/CodeGen/AArch64/arm64-stackmap.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-darwin < %s | FileCheck %s -; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort=1 < %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-darwin -mattr=+reserve-x18 < %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-darwin -mattr=+reserve-x18 -fast-isel -fast-isel-abort=1 < %s | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. diff --git a/test/CodeGen/AArch64/arm64-stp.ll b/test/CodeGen/AArch64/arm64-stp.ll index 72561aac6e871..98242d0bb57e5 100644 --- a/test/CodeGen/AArch64/arm64-stp.ll +++ b/test/CodeGen/AArch64/arm64-stp.ll @@ -1,8 +1,6 @@ ; RUN: llc < %s -march=arm64 -aarch64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s -; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\ -; RUN: -verify-machineinstrs -mcpu=cyclone | FileCheck -check-prefix=STUR_CHK %s -; CHECK: stp_int +; CHECK-LABEL: stp_int ; CHECK: stp w0, w1, [x2] define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { store i32 %a, i32* %p, align 4 @@ -11,7 +9,7 @@ define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { ret void } -; CHECK: stp_long +; CHECK-LABEL: stp_long ; CHECK: stp x0, x1, [x2] define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { store i64 %a, i64* %p, align 8 @@ -20,7 +18,7 @@ define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { ret void } -; CHECK: stp_float +; CHECK-LABEL: stp_float ; CHECK: stp s0, s1, [x0] define void @stp_float(float %a, float %b, float* nocapture %p) nounwind { store float %a, float* %p, align 4 @@ -29,7 +27,7 @@ define void @stp_float(float %a, float %b, float* nocapture %p) 
nounwind { ret void } -; CHECK: stp_double +; CHECK-LABEL: stp_double ; CHECK: stp d0, d1, [x0] define void @stp_double(double %a, double %b, double* nocapture %p) nounwind { store double %a, double* %p, align 8 @@ -40,9 +38,9 @@ define void @stp_double(double %a, double %b, double* nocapture %p) nounwind { ; Test the load/store optimizer---combine ldurs into a ldp, if appropriate define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { -; STUR_CHK: stur_int -; STUR_CHK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_int +; CHECK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %p, i32 -1 store i32 %a, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %p, i32 -2 @@ -51,9 +49,9 @@ define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { } define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { -; STUR_CHK: stur_long -; STUR_CHK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_long +; CHECK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %p, i32 -1 store i64 %a, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %p, i32 -2 @@ -62,9 +60,9 @@ define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { } define void @stur_float(float %a, float %b, float* nocapture %p) nounwind { -; STUR_CHK: stur_float -; STUR_CHK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_float +; CHECK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8] +; CHECK-NEXT: ret %p1 = getelementptr inbounds float, float* %p, i32 -1 store float %a, float* %p1, align 2 %p2 = getelementptr inbounds float, float* %p, i32 -2 @@ -73,9 +71,9 @@ define void @stur_float(float %a, float %b, float* nocapture %p) nounwind { } define void @stur_double(double %a, double %b, double* nocapture %p) nounwind { -; 
STUR_CHK: stur_double -; STUR_CHK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_double +; CHECK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16] +; CHECK-NEXT: ret %p1 = getelementptr inbounds double, double* %p, i32 -1 store double %a, double* %p1, align 2 %p2 = getelementptr inbounds double, double* %p, i32 -2 diff --git a/test/CodeGen/AArch64/arm64-strict-align.ll b/test/CodeGen/AArch64/arm64-strict-align.ll index 109f4115d8017..28c158f7a2eb0 100644 --- a/test/CodeGen/AArch64/arm64-strict-align.ll +++ b/test/CodeGen/AArch64/arm64-strict-align.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s -; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-no-strict-align | FileCheck %s -; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT -; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align -fast-isel | FileCheck %s --check-prefix=CHECK-STRICT +; RUN: llc < %s -mtriple=arm64-apple-darwin -mattr=+strict-align | FileCheck %s --check-prefix=CHECK-STRICT +; RUN: llc < %s -mtriple=arm64-apple-darwin -mattr=+strict-align -fast-isel | FileCheck %s --check-prefix=CHECK-STRICT define i32 @f0(i32* nocapture %p) nounwind { ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2] diff --git a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll index f94f88a1183fe..c95eca062ff6a 100644 --- a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll +++ b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll @@ -1,4 +1,7 @@ -; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \ +; RUN: -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=NOEMU %s +; RUN: llc -emulated-tls -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \ +; RUN: -verify-machineinstrs < %s | 
FileCheck -check-prefix=CHECK -check-prefix=EMU %s ; If the .tlsdesccall and blr parts are emitted completely separately (even with ; glue) then LLVM will separate them quite happily (with a spill at O0, hence @@ -13,6 +16,40 @@ define i32 @test_generaldynamic() { %val = load i32, i32* @general_dynamic_var ret i32 %val -; CHECK: .tlsdesccall general_dynamic_var -; CHECK-NEXT: blr {{x[0-9]+}} +; NOEMU: .tlsdesccall general_dynamic_var +; NOEMU-NEXT: blr {{x[0-9]+}} +; NOEMU-NOT: __emutls_v.general_dynamic_var: + +; EMU: adrp{{.+}}__emutls_v.general_dynamic_var +; EMU: bl __emutls_get_address + +; EMU-NOT: __emutls_v.general_dynamic_var +; EMU-NOT: __emutls_t.general_dynamic_var +} + +@emulated_init_var = thread_local global i32 37, align 8 + +define i32 @test_emulated_init() { +; COMMON-LABEL: test_emulated_init: + + %val = load i32, i32* @emulated_init_var + ret i32 %val + +; EMU: adrp{{.+}}__emutls_v.emulated_init_var +; EMU: bl __emutls_get_address + +; EMU-NOT: __emutls_v.general_dynamic_var: + +; EMU: .align 3 +; EMU-LABEL: __emutls_v.emulated_init_var: +; EMU-NEXT: .xword 4 +; EMU-NEXT: .xword 8 +; EMU-NEXT: .xword 0 +; EMU-NEXT: .xword __emutls_t.emulated_init_var + +; EMU-LABEL: __emutls_t.emulated_init_var: +; EMU-NEXT: .word 37 } + +; CHECK-NOT: __emutls_v.general_dynamic_var: +; EMU-NOT: __emutls_t.general_dynamic_var diff --git a/test/CodeGen/AArch64/arm64-trunc-store.ll b/test/CodeGen/AArch64/arm64-trunc-store.ll index 7cde629b33ae4..be0388284fb85 100644 --- a/test/CodeGen/AArch64/arm64-trunc-store.ll +++ b/test/CodeGen/AArch64/arm64-trunc-store.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s define void @bar(<8 x i16> %arg, <8 x i8>* %p) nounwind { ; CHECK-LABEL: bar: diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll index a52c4ebf13e7e..c1800085884c9 100644 --- a/test/CodeGen/AArch64/arm64-vabs.ll +++ 
b/test/CodeGen/AArch64/arm64-vabs.ll @@ -134,6 +134,72 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ret <2 x i64> %tmp4 } +define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) { +; CHECK-LABEL: uabdl8h_log2_shuffle +; CHECK: uabdl2.8h +; CHECK: uabdl.8h + %aload = load <16 x i8>, <16 x i8>* %a, align 1 + %bload = load <16 x i8>, <16 x i8>* %b, align 1 + %aext = zext <16 x i8> %aload to <16 x i16> + %bext = zext <16 x i8> %bload to <16 x i16> + %abdiff = sub nsw <16 x i16> %aext, %bext + %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer + %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff + %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff + %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> + %bin1.rdx = add <16 x i16> %absel, %rdx.shuf + %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> + %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx + %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> + %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136 + %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> + %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0 + ret i16 %reduced_v +} + +define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) { +; CHECK-LABEL: uabdl4s_log2_shuffle +; CHECK: uabdl2.4s +; CHECK: uabdl.4s + %aload = load <8 x i16>, <8 x i16>* %a, align 1 + %bload = load <8 x i16>, <8 x i16>* %b, align 1 + %aext = zext <8 x i16> %aload to <8 x i32> + %bext = zext <8 x i16> %bload to <8 x i32> + %abdiff = sub nsw <8 x i32> %aext, %bext + %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer + %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff + %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff + %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %absel, %rdx.shuf 
+ %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136 + %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> + %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0 + ret i32 %reduced_v +} + +define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { +; CHECK: uabdl2d_log2_shuffle +; CHECK: uabdl2.2d +; CHECK: uabdl.2d + %aload = load <4 x i32>, <4 x i32>* %a, align 1 + %bload = load <4 x i32>, <4 x i32>* %b, align 1 + %aext = zext <4 x i32> %aload to <4 x i64> + %bext = zext <4 x i32> %bload to <4 x i64> + %abdiff = sub nsw <4 x i64> %aext, %bext + %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer + %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff + %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff + %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> + %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136 + %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> + %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0 + ret i64 %reduced_v +} + define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fabd_2s: ;CHECK: fabd.2s diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll index 44f2af1c5e79a..8702b41023d0c 100644 --- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false -disable-post-ra < %s | FileCheck %s %va_list = type {i8*, i8*, i8*, i32, i32} diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll 
b/test/CodeGen/AArch64/arm64-vector-ext.ll index 5bee1611e6c64..994a9956cf7f8 100644 --- a/test/CodeGen/AArch64/arm64-vector-ext.ll +++ b/test/CodeGen/AArch64/arm64-vector-ext.ll @@ -1,27 +1,27 @@ -; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s - -;CHECK: @func30 -;CHECK: ushll.4s v0, v0, #0 -;CHECK: movi.4s v1, #0x1 -;CHECK: and.16b v0, v0, v1 -;CHECK: str q0, [x0] -;CHECK: ret - -%T0_30 = type <4 x i1> -%T1_30 = type <4 x i32> -define void @func30(%T0_30 %v0, %T1_30* %p1) { - %r = zext %T0_30 %v0 to %T1_30 - store %T1_30 %r, %T1_30* %p1 - ret void -} - -; Extend from v1i1 was crashing things (PR20791). Make sure we do something -; sensible instead. -define <1 x i32> @autogen_SD7918() { -; CHECK-LABEL: autogen_SD7918 -; CHECK: movi d0, #0000000000000000 -; CHECK-NEXT: ret - %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0 - %ZE = zext <1 x i1> %I29 to <1 x i32> - ret <1 x i32> %ZE -} +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +;CHECK: @func30 +;CHECK: movi.4h v1, #0x1 +;CHECK: and.8b v0, v0, v1 +;CHECK: ushll.4s v0, v0, #0 +;CHECK: str q0, [x0] +;CHECK: ret + +%T0_30 = type <4 x i1> +%T1_30 = type <4 x i32> +define void @func30(%T0_30 %v0, %T1_30* %p1) { + %r = zext %T0_30 %v0 to %T1_30 + store %T1_30 %r, %T1_30* %p1 + ret void +} + +; Extend from v1i1 was crashing things (PR20791). Make sure we do something +; sensible instead. 
+define <1 x i32> @autogen_SD7918() { +; CHECK-LABEL: autogen_SD7918 +; CHECK: movi d0, #0000000000000000 +; CHECK-NEXT: ret + %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0 + %ZE = zext <1 x i1> %I29 to <1 x i32> + ret <1 x i32> %ZE +} diff --git a/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/test/CodeGen/AArch64/arm64-vminmaxnm.ll index b5aca45cd479a..302ba9d681c64 100644 --- a/test/CodeGen/AArch64/arm64-vminmaxnm.ll +++ b/test/CodeGen/AArch64/arm64-vminmaxnm.ll @@ -42,13 +42,28 @@ define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp ret <2 x double> %vminnm2.i } +define float @f7(float %a, float %b) nounwind readnone ssp { +; CHECK: fmaxnm s0, s0, s1 +; CHECK: ret + %vmaxnm2.i = tail call float @llvm.aarch64.neon.fmaxnm.f32(float %a, float %b) nounwind + ret float %vmaxnm2.i +} + +define double @f8(double %a, double %b) nounwind readnone ssp { +; CHECK: fminnm d0, d0, d1 +; CHECK: ret + %vmaxnm2.i = tail call double @llvm.aarch64.neon.fminnm.f64(double %a, double %b) nounwind + ret double %vmaxnm2.i +} + declare <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone declare <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone declare <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone declare <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone - +declare float @llvm.aarch64.neon.fmaxnm.f32(float, float) nounwind readnone +declare double @llvm.aarch64.neon.fminnm.f64(double, double) nounwind readnone define double @test_fmaxnmv(<2 x double> %in) { ; CHECK-LABEL: test_fmaxnmv: diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll index 
ce9c0a64b5872..ec49110d40526 100644 --- a/test/CodeGen/AArch64/arm64-xaluo.ll +++ b/test/CodeGen/AArch64/arm64-xaluo.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -disable-post-ra -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort=1 -disable-post-ra -verify-machineinstrs < %s | FileCheck %s ; ; Get the actual value of the overflow bit. diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll index cb90caeadc1f3..900d2072925f5 100644 --- a/test/CodeGen/AArch64/atomic-ops.ll +++ b/test/CodeGen/AArch64/atomic-ops.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created @@ -893,6 +893,8 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] @@ -916,6 +918,8 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], 
[[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] @@ -927,21 +931,21 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic %old = extractvalue { i32, i1 } %pair, 0 +; CHECK: mov {{[xw]}}[[WANTED:[0-9]+]], {{[xw]}}0 + ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 ; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]: ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] - ; w0 below is a reasonable guess but could change: it certainly comes into the - ; function there. -; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: cmp w[[OLD]], w[[WANTED]] ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb - -; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] ret i32 %old } @@ -963,6 +967,8 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; As above, w1 is a reasonable guess. ; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: str x[[OLD]], diff --git a/test/CodeGen/AArch64/bitcast-v2i8.ll b/test/CodeGen/AArch64/bitcast-v2i8.ll index 4bdac641c5bca..aff3ffc70a711 100644 --- a/test/CodeGen/AArch64/bitcast-v2i8.ll +++ b/test/CodeGen/AArch64/bitcast-v2i8.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck %s ; Part of PR21549: going through the stack isn't ideal but is correct. 
diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 9b731fa72a470..509b547a5c82a 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -196,3 +196,44 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) { ret void } + +; Bitfield insert where the second or operand is a better match to be folded into the BFM +define void @test_32bit_opnd1_better(i32* %existing, i32* %new) { +; CHECK-LABEL: test_32bit_opnd1_better: + + %oldval = load volatile i32, i32* %existing + %oldval_keep = and i32 %oldval, 65535 ; 0x0000ffff + + %newval = load i32, i32* %new + %newval_shifted = shl i32 %newval, 16 + %newval_masked = and i32 %newval_shifted, 16711680 ; 0x00ff0000 + + %combined = or i32 %oldval_keep, %newval_masked + store volatile i32 %combined, i32* %existing +; CHECK: and [[BIT:w[0-9]+]], {{w[0-9]+}}, #0xffff +; CHECK: bfi [[BIT]], {{w[0-9]+}}, #16, #8 + + ret void +} + +; Tests when all the bits from one operand are not useful +define i32 @test_nouseful_bits(i8 %a, i32 %b) { +; CHECK-LABEL: test_nouseful_bits: +; CHECK: bfi +; CHECK: bfi +; CHECK: bfi +; CHECK-NOT: bfi +; CHECK-NOT: or +; CHECK: lsl + %conv = zext i8 %a to i32 ; 0 0 0 A + %shl = shl i32 %b, 8 ; B2 B1 B0 0 + %or = or i32 %conv, %shl ; B2 B1 B0 A + %shl.1 = shl i32 %or, 8 ; B1 B0 A 0 + %or.1 = or i32 %conv, %shl.1 ; B1 B0 A A + %shl.2 = shl i32 %or.1, 8 ; B0 A A 0 + %or.2 = or i32 %conv, %shl.2 ; B0 A A A + %shl.3 = shl i32 %or.2, 8 ; A A A 0 + %or.3 = or i32 %conv, %shl.3 ; A A A A + %shl.4 = shl i32 %or.3, 8 ; A A A 0 + ret i32 %shl.4 +} diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll index 78399c80b5de2..5f19b6943b8e2 100644 --- a/test/CodeGen/AArch64/bitfield.ll +++ b/test/CodeGen/AArch64/bitfield.ll @@ -3,51 +3,67 @@ @var32 = global i32 0 @var64 = global i64 0 -define void @test_extendb(i8 %var) { -; CHECK-LABEL: test_extendb: +define void @test_extendb32(i8 %var) { +; 
CHECK-LABEL: test_extendb32: %sxt32 = sext i8 %var to i32 store volatile i32 %sxt32, i32* @var32 ; CHECK: sxtb {{w[0-9]+}}, {{w[0-9]+}} - %sxt64 = sext i8 %var to i64 - store volatile i64 %sxt64, i64* @var64 -; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}} - ; N.b. this doesn't actually produce a bitfield instruction at the ; moment, but it's still a good test to have and the semantics are ; correct. %uxt32 = zext i8 %var to i32 store volatile i32 %uxt32, i32* @var32 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff + ret void +} + +define void @test_extendb64(i8 %var) { +; CHECK-LABEL: test_extendb64: + + %sxt64 = sext i8 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}} +; N.b. this doesn't actually produce a bitfield instruction at the +; moment, but it's still a good test to have and the semantics are +; correct. %uxt64 = zext i8 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff ret void } -define void @test_extendh(i16 %var) { -; CHECK-LABEL: test_extendh: +define void @test_extendh32(i16 %var) { +; CHECK-LABEL: test_extendh32: %sxt32 = sext i16 %var to i32 store volatile i32 %sxt32, i32* @var32 ; CHECK: sxth {{w[0-9]+}}, {{w[0-9]+}} - %sxt64 = sext i16 %var to i64 - store volatile i64 %sxt64, i64* @var64 -; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} - ; N.b. this doesn't actually produce a bitfield instruction at the ; moment, but it's still a good test to have and the semantics are ; correct. %uxt32 = zext i16 %var to i32 store volatile i32 %uxt32, i32* @var32 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff + ret void +} + +define void @test_extendh64(i16 %var) { +; CHECK-LABEL: test_extendh64: + + %sxt64 = sext i16 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} +; N.b. 
this doesn't actually produce a bitfield instruction at the +; moment, but it's still a good test to have and the semantics are +; correct. %uxt64 = zext i16 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff ret void } @@ -60,7 +76,7 @@ define void @test_extendw(i32 %var) { %uxt64 = zext i32 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32 +; CHECK: mov {{w[0-9]+}}, w0 ret void } diff --git a/test/CodeGen/AArch64/bitreverse.ll b/test/CodeGen/AArch64/bitreverse.ll new file mode 100644 index 0000000000000..936e3554b397f --- /dev/null +++ b/test/CodeGen/AArch64/bitreverse.ll @@ -0,0 +1,87 @@ +; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s + +; These tests just check that the plumbing is in place for @llvm.bitreverse. The +; actual output is massive at the moment as llvm.bitreverse is not yet legal. + +declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone + +define <2 x i16> @f(<2 x i16> %a) { +; CHECK-LABEL: f: +; CHECK: ushr + %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) + ret <2 x i16> %b +} + +declare i8 @llvm.bitreverse.i8(i8) readnone + +; Unfortunately some of the shift-and-inserts become BFIs, and some do not :( +define i8 @g(i8 %a) { +; CHECK-LABEL: g: +; CHECK-DAG: lsr [[S5:w.*]], w0, #5 +; CHECK-DAG: lsr [[S4:w.*]], w0, #4 +; CHECK-DAG: lsr [[S3:w.*]], w0, #3 +; CHECK-DAG: lsr [[S2:w.*]], w0, #2 +; CHECK-DAG: lsl [[L1:w.*]], w0, #29 +; CHECK-DAG: lsl [[L2:w.*]], w0, #19 +; CHECK-DAG: lsl [[L3:w.*]], w0, #17 + +; CHECK-DAG: and [[T1:w.*]], [[L1]], #0x40000000 +; CHECK-DAG: bfi [[T1]], w0, #31, #1 +; CHECK-DAG: bfi [[T1]], [[S2]], #29, #1 +; CHECK-DAG: bfi [[T1]], [[S3]], #28, #1 +; CHECK-DAG: bfi [[T1]], [[S4]], #27, #1 +; CHECK-DAG: bfi [[T1]], [[S5]], #26, #1 +; CHECK-DAG: and [[T2:w.*]], [[L2]], #0x2000000 +; CHECK-DAG: and [[T3:w.*]], [[L3]], #0x1000000 +; CHECK-DAG: orr [[T4:w.*]], 
[[T1]], [[T2]] +; CHECK-DAG: orr [[T5:w.*]], [[T4]], [[T3]] +; CHECK: lsr w0, [[T5]], #24 + + %b = call i8 @llvm.bitreverse.i8(i8 %a) + ret i8 %b +} + +declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>) readnone + +define <8 x i8> @g_vec(<8 x i8> %a) { +; Try and match as much of the sequence as precisely as possible. + +; CHECK-LABEL: g_vec: +; CHECK-DAG: movi [[M1:v.*]], #0x80 +; CHECK-DAG: movi [[M2:v.*]], #0x40 +; CHECK-DAG: movi [[M3:v.*]], #0x20 +; CHECK-DAG: movi [[M4:v.*]], #0x10 +; CHECK-DAG: movi [[M5:v.*]], #0x8 +; CHECK-DAG: movi [[M6:v.*]], #0x4{{$}} +; CHECK-DAG: movi [[M7:v.*]], #0x2{{$}} +; CHECK-DAG: movi [[M8:v.*]], #0x1{{$}} +; CHECK-DAG: shl [[S1:v.*]], v0.8b, #7 +; CHECK-DAG: shl [[S2:v.*]], v0.8b, #5 +; CHECK-DAG: shl [[S3:v.*]], v0.8b, #3 +; CHECK-DAG: shl [[S4:v.*]], v0.8b, #1 +; CHECK-DAG: ushr [[S5:v.*]], v0.8b, #1 +; CHECK-DAG: ushr [[S6:v.*]], v0.8b, #3 +; CHECK-DAG: ushr [[S7:v.*]], v0.8b, #5 +; CHECK-DAG: ushr [[S8:v.*]], v0.8b, #7 +; CHECK-DAG: and [[A1:v.*]], [[S1]], [[M1]] +; CHECK-DAG: and [[A2:v.*]], [[S2]], [[M2]] +; CHECK-DAG: and [[A3:v.*]], [[S3]], [[M3]] +; CHECK-DAG: and [[A4:v.*]], [[S4]], [[M4]] +; CHECK-DAG: and [[A5:v.*]], [[S5]], [[M5]] +; CHECK-DAG: and [[A6:v.*]], [[S6]], [[M6]] +; CHECK-DAG: and [[A7:v.*]], [[S7]], [[M7]] +; CHECK-DAG: and [[A8:v.*]], [[S8]], [[M8]] + +; The rest can be ORRed together in any order; it's not worth the test +; maintenance to match them precisely. 
+; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK: ret + %b = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a) + ret <8 x i8> %b +} diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll index c78fabac61874..004267f4e4e04 100644 --- a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -403,6 +403,32 @@ return: ; preds = %land.lhs.true, %con ret i32 %retval.0 } +define void @cmp_shifted(i32 %in, i32 %lhs, i32 %rhs) { +; CHECK-LABEL: cmp_shifted: +; CHECK: cmp w0, #1 +; [...] +; CHECK: cmp w0, #2, lsl #12 + + %tst_low = icmp sgt i32 %in, 0 + br i1 %tst_low, label %true, label %false + +true: + call i32 @zoo(i32 128) + ret void + +false: + %tst = icmp sgt i32 %in, 8191 + br i1 %tst, label %truer, label %falser + +truer: + call i32 @zoo(i32 42) + ret void + +falser: + call i32 @zoo(i32 1) + ret void +} + declare i32 @zoo(i32) declare double @yoo(i32) diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll index 1266842fcc6d1..a8399f92ebe4e 100644 --- a/test/CodeGen/AArch64/cpus.ll +++ b/test/CodeGen/AArch64/cpus.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a35 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll new file mode 100644 index 0000000000000..a9ae00c8d270b --- /dev/null +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-ios 
-enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s +; Shrink wrapping currently does not kick in because we have a TLS CALL +; in the entry block and it will clobber the link register. + +%struct.S = type { i8 } + +@sg = internal thread_local global %struct.S zeroinitializer, align 1 +@__dso_handle = external global i8 +@__tls_guard = internal thread_local unnamed_addr global i1 false + +declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) +declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) +declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) + +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind { + %.b.i = load i1, i1* @__tls_guard, align 1 + br i1 %.b.i, label %__tls_init.exit, label %init.i + +init.i: + store i1 true, i1* @__tls_guard, align 1 + %call.i.i = tail call %struct.S* @_ZN1SC1Ev(%struct.S* nonnull @sg) + %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (%struct.S* (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle) + br label %__tls_init.exit + +__tls_init.exit: + ret %struct.S* @sg +} + +; CHECK-LABEL: _ZTW2sg +; CHECK-NOT: stp d31, d30 +; CHECK-NOT: stp d29, d28 +; CHECK-NOT: stp d27, d26 +; CHECK-NOT: stp d25, d24 +; CHECK-NOT: stp d23, d22 +; CHECK-NOT: stp d21, d20 +; CHECK-NOT: stp d19, d18 +; CHECK-NOT: stp d17, d16 +; CHECK-NOT: stp d7, d6 +; CHECK-NOT: stp d5, d4 +; CHECK-NOT: stp d3, d2 +; CHECK-NOT: stp d1, d0 +; CHECK-NOT: stp x20, x19 +; CHECK-NOT: stp x14, x13 +; CHECK-NOT: stp x12, x11 +; CHECK-NOT: stp x10, x9 +; CHECK-NOT: stp x8, x7 +; CHECK-NOT: stp x6, x5 +; CHECK-NOT: stp x4, x3 +; CHECK-NOT: stp x2, x1 +; CHECK: blr +; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]] +; CHECK: blr +; CHECK: tlv_atexit +; CHECK: [[BB_end]]: +; CHECK: blr +; CHECK-NOT: ldp x2, x1 +; CHECK-NOT: ldp x4, x3 +; CHECK-NOT: ldp x6, x5 +; CHECK-NOT: ldp x8, x7 +; CHECK-NOT: ldp x10, x9 +; CHECK-NOT: ldp x12, x11 +; CHECK-NOT: ldp x14, 
x13 +; CHECK-NOT: ldp x20, x19 +; CHECK-NOT: ldp d1, d0 +; CHECK-NOT: ldp d3, d2 +; CHECK-NOT: ldp d5, d4 +; CHECK-NOT: ldp d7, d6 +; CHECK-NOT: ldp d17, d16 +; CHECK-NOT: ldp d19, d18 +; CHECK-NOT: ldp d21, d20 +; CHECK-NOT: ldp d23, d22 +; CHECK-NOT: ldp d25, d24 +; CHECK-NOT: ldp d27, d26 +; CHECK-NOT: ldp d29, d28 +; CHECK-NOT: ldp d31, d30 diff --git a/test/CodeGen/AArch64/dag-combine-select.ll b/test/CodeGen/AArch64/dag-combine-select.ll new file mode 100644 index 0000000000000..45b998d9136d0 --- /dev/null +++ b/test/CodeGen/AArch64/dag-combine-select.ll @@ -0,0 +1,47 @@ +; RUN: llc -disable-post-ra -o - %s | FileCheck %s +target triple = "arm64--" + +@out = internal global i32 0, align 4 + +; Ensure that we transform select(C0, x, select(C1, x, y)) towards +; select(C0 | C1, x, y) so we can use CMP;CCMP for the implementation. +; CHECK-LABEL: test0: +; CHECK: cmp w0, #7 +; CHECK: ccmp w1, #0, #0, ne +; CHECK: csel w0, w1, w2, gt +; CHECK: ret +define i32 @test0(i32 %v0, i32 %v1, i32 %v2) { + %cmp1 = icmp eq i32 %v0, 7 + %cmp2 = icmp sgt i32 %v1, 0 + %sel0 = select i1 %cmp1, i32 %v1, i32 %v2 + %sel1 = select i1 %cmp2, i32 %v1, i32 %sel0 + ret i32 %sel1 +} + +; Usually we keep select(C0 | C1, x, y) as is on aarch64 to create CMP;CCMP +; sequences. This case should be transformed to select(C0, select(C1, x, y), y) +; anyway to get CSE effects. 
+; CHECK-LABEL: test1: +; CHECK-NOT: ccmp +; CHECK: cmp w0, #7 +; CHECK: adrp x[[OUTNUM:[0-9]+]], out +; CHECK: csel w[[SEL0NUM:[0-9]+]], w1, w2, eq +; CHECK: cmp w[[SEL0NUM]], #13 +; CHECK: csel w[[SEL1NUM:[0-9]+]], w1, w2, lo +; CHECK: cmp w0, #42 +; CHECK: csel w[[SEL2NUM:[0-9]+]], w1, w[[SEL1NUM]], eq +; CHECK: str w[[SEL1NUM]], [x[[OUTNUM]], :lo12:out] +; CHECK: str w[[SEL2NUM]], [x[[OUTNUM]], :lo12:out] +; CHECK: ret +define void @test1(i32 %bitset, i32 %val0, i32 %val1) { + %cmp1 = icmp eq i32 %bitset, 7 + %cond = select i1 %cmp1, i32 %val0, i32 %val1 + %cmp5 = icmp ult i32 %cond, 13 + %cond11 = select i1 %cmp5, i32 %val0, i32 %val1 + %cmp3 = icmp eq i32 %bitset, 42 + %or.cond = or i1 %cmp3, %cmp5 + %cond17 = select i1 %or.cond, i32 %val0, i32 %val1 + store volatile i32 %cond11, i32* @out, align 4 + store volatile i32 %cond17, i32* @out, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/divrem.ll b/test/CodeGen/AArch64/divrem.ll new file mode 100644 index 0000000000000..9f648eb63eac1 --- /dev/null +++ b/test/CodeGen/AArch64/divrem.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s + +; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and +; should not generate select error. 
+define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { +; CHECK-LABEL: test_udivrem +; CHECK-DAG: udivrem +; CHECK-NOT: LLVM ERROR: Cannot select + %div = udiv <2 x i32> %x, %y + store <2 x i32> %div, <2 x i32>* %z + %1 = urem <2 x i32> %x, %y + ret <2 x i32> %1 +} + +define <4 x i32> @test_sdivrem(<4 x i32> %x, <4 x i32>* %y) { +; CHECK-LABEL: test_sdivrem +; CHECK-DAG: sdivrem + %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + store <4 x i32> %div, <4 x i32>* %y + %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + ret <4 x i32> %1 +} diff --git a/test/CodeGen/AArch64/emutls.ll b/test/CodeGen/AArch64/emutls.ll new file mode 100644 index 0000000000000..ac5762edba98b --- /dev/null +++ b/test/CodeGen/AArch64/emutls.ll @@ -0,0 +1,116 @@ +; RUN: llc -emulated-tls -mtriple=aarch64-linux-android \ +; RUN: -relocation-model=pic < %s | FileCheck -check-prefix=ARM64 %s + +; Copied from X86/emutls.ll + +; Use my_emutls_get_address like __emutls_get_address. 
+@my_emutls_v_xyz = external global i8*, align 4 +declare i8* @my_emutls_get_address(i8*) + +define i32 @my_get_xyz() { +; ARM64-LABEL: my_get_xyz: +; ARM64: adrp x0, :got:my_emutls_v_xyz +; ARM64-NEXT: ldr x0, [x0, :got_lo12:my_emutls_v_xyz] +; ARM64-NEXT: bl my_emutls_get_address +; ARM64-NEXT: ldr w0, [x0] +; ARM64-NEXT: ldp x29, x30, [sp] + +entry: + %call = call i8* @my_emutls_get_address(i8* bitcast (i8** @my_emutls_v_xyz to i8*)) + %0 = bitcast i8* %call to i32* + %1 = load i32, i32* %0, align 4 + ret i32 %1 +} + +@i1 = thread_local global i32 15 +@i2 = external thread_local global i32 +@i3 = internal thread_local global i32 15 +@i4 = hidden thread_local global i32 15 +@i5 = external hidden thread_local global i32 +@s1 = thread_local global i16 15 +@b1 = thread_local global i8 0 + +define i32 @f1() { +; ARM64-LABEL: f1: +; ARM64: adrp x0, :got:__emutls_v.i1 +; ARM64-NEXT: ldr x0, [x0, :got_lo12:__emutls_v.i1] +; ARM64-NEXT: bl __emutls_get_address +; ARM64-NEXT: ldr w0, [x0] +; ARM64-NEXT: ldp x29, x30, [sp] + +entry: + %tmp1 = load i32, i32* @i1 + ret i32 %tmp1 +} + +define i32* @f2() { +; ARM64-LABEL: f2: +; ARM64: adrp x0, :got:__emutls_v.i1 +; ARM64-NEXT: ldr x0, [x0, :got_lo12:__emutls_v.i1] +; ARM64-NEXT: bl __emutls_get_address +; ARM64-NEXT: ldp x29, x30, [sp] + +entry: + ret i32* @i1 +} + +;;;;;;;;;;;;;; 64-bit __emutls_v. and __emutls_t. 
+ +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.i1: +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.i1 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.i1: +; ARM64-NEXT: .word 15 + +; ARM64-NOT: __emutls_v.i2 + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.i3: +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.i3 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.i3: +; ARM64-NEXT: .word 15 + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.i4: +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.i4 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.i4: +; ARM64-NEXT: .word 15 + +; ARM64-NOT: __emutls_v.i5: +; ARM64 .hidden __emutls_v.i5 +; ARM64-NOT: __emutls_v.i5: + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.s1: +; ARM64-NEXT: .xword 2 +; ARM64-NEXT: .xword 2 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.s1 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.s1: +; ARM64-NEXT: .hword 15 + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.b1: +; ARM64-NEXT: .xword 1 +; ARM64-NEXT: .xword 1 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword 0 + +; ARM64-NOT: __emutls_t.b1 diff --git a/test/CodeGen/AArch64/emutls_generic.ll b/test/CodeGen/AArch64/emutls_generic.ll new file mode 100644 index 0000000000000..7664db3df8d27 --- /dev/null +++ b/test/CodeGen/AArch64/emutls_generic.ll @@ -0,0 +1,59 @@ +; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -relocation-model=pic \ +; RUN: | FileCheck -check-prefix=ARM_64 %s +; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -relocation-model=pic -O3 \ +; RUN: | FileCheck -check-prefix=ARM_64 %s +; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -O3 \ +; RUN: | FileCheck -check-prefix=ARM_64 %s + +; Make sure that TLS symbols are 
emitted in expected order. + +@external_x = external thread_local global i32, align 8 +@external_y = thread_local global i8 7, align 2 +@internal_y = internal thread_local global i64 9, align 16 + +define i32* @get_external_x() { +entry: + ret i32* @external_x +} + +define i8* @get_external_y() { +entry: + ret i8* @external_y +} + +define i64* @get_internal_y() { +entry: + ret i64* @internal_y +} + +; ARM_64-LABEL: get_external_x: +; ARM_64: __emutls_v.external_x +; ARM_64: __emutls_get_address +; ARM_64-LABEL: get_external_y: +; ARM_64: __emutls_v.external_y +; ARM_64: __emutls_get_address +; ARM_64-LABEL: get_internal_y: +; ARM_64: __emutls_v.internal_y +; ARM_64: __emutls_get_address +; ARM_64-NOT: __emutls_t.external_x +; ARM_64-NOT: __emutls_v.external_x: +; ARM_64: .align 3 +; ARM_64-LABEL: __emutls_v.external_y: +; ARM_64-NEXT: .xword 1 +; ARM_64-NEXT: .xword 2 +; ARM_64-NEXT: .xword 0 +; ARM_64-NEXT: .xword __emutls_t.external_y +; ARM_64-NOT: __emutls_v.external_x: +; ARM_64: .section .rodata, +; ARM_64-LABEL: __emutls_t.external_y: +; ARM_64-NEXT: .byte 7 +; ARM_64: .data +; ARM_64: .align 3 +; ARM_64-LABEL: __emutls_v.internal_y: +; ARM_64-NEXT: .xword 8 +; ARM_64-NEXT: .xword 16 +; ARM_64-NEXT: .xword 0 +; ARM_64-NEXT: .xword __emutls_t.internal_y +; ARM_64: .section .rodata, +; ARM_64-LABEL: __emutls_t.internal_y: +; ARM_64-NEXT: .xword 9 diff --git a/test/CodeGen/AArch64/eon.ll b/test/CodeGen/AArch64/eon.ll new file mode 100644 index 0000000000000..ea61ce34c050a --- /dev/null +++ b/test/CodeGen/AArch64/eon.ll @@ -0,0 +1,29 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +; Check that the eon instruction is generated instead of eor,movn +define i64 @test1(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: test1: +; CHECK: eon +; CHECK: ret +entry: + %shl = shl i64 %b, 4 + %neg = xor i64 %a, -1 + %xor = xor i64 %shl, %neg + ret i64 %xor +} + +; Same check with mutliple uses of %neg +define i64 @test2(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: 
test2: +; CHECK: eon +; CHECK: eon +; CHECK: lsl +; CHECK: ret +entry: + %shl = shl i64 %b, 4 + %neg = xor i64 %shl, -1 + %xor = xor i64 %neg, %a + %xor1 = xor i64 %c, %neg + %shl2 = shl i64 %xor, %xor1 + ret i64 %shl2 +} diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll index be5e2e51385d6..e8ecb13b35645 100644 --- a/test/CodeGen/AArch64/f16-instructions.ll +++ b/test/CodeGen/AArch64/f16-instructions.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -143,6 +143,33 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { ret half %r } +; CHECK-LABEL: test_select_cc_f32_f16: +; CHECK-DAG: fcvt s2, h2 +; CHECK-DAG: fcvt s3, h3 +; CHECK-NEXT: fcmp s2, s3 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: ret +define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { + %cc = fcmp une half %c, %d + %r = select i1 %cc, float %a, float %b + ret float %r +} + +; CHECK-LABEL: test_select_cc_f16_f32: +; CHECK-DAG: fcvt s0, h0 +; CHECK-DAG: fcvt s1, h1 +; CHECK-DAG: fcmp s2, s3 +; CHECK-DAG: cset w8, ne +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 { + %cc = fcmp une float %c, %d + %r = select i1 %cc, half %a, half %b + ret half %r +} + ; CHECK-LABEL: test_fcmp_une: ; CHECK-NEXT: fcvt s1, h1 ; CHECK-NEXT: fcvt s0, h0 @@ -644,13 +671,10 @@ define half @test_fabs(half %a) #0 { } ; CHECK-LABEL: test_minnum: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
-; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: bl {{_?}}fminf +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fminnm s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldp x29, x30, [sp], #16 ; CHECK-NEXT: ret define half @test_minnum(half %a, half %b) #0 { %r = call half @llvm.minnum.f16(half %a, half %b) @@ -658,13 +682,10 @@ define half @test_minnum(half %a, half %b) #0 { } ; CHECK-LABEL: test_maxnum: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: bl {{_?}}fmaxf +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldp x29, x30, [sp], #16 ; CHECK-NEXT: ret define half @test_maxnum(half %a, half %b) #0 { %r = call half @llvm.maxnum.f16(half %a, half %b) @@ -683,11 +704,50 @@ define half @test_copysign(half %a, half %b) #0 { ret half %r } -; CHECK-LABEL: test_floor: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frintm s0, s1 +; CHECK-LABEL: test_copysign_f32: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: ret +define half @test_copysign_f32(half %a, float %b) #0 { + %tb = fptrunc float %b to half + %r = call half @llvm.copysign.f16(half %a, half %tb) + ret half %r +} + +; CHECK-LABEL: test_copysign_f64: +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_copysign_f64(half %a, double %b) #0 { + %tb = fptrunc double %b to half + %r = call half @llvm.copysign.f16(half %a, half %tb) + ret half %r +} + +; Check that the FP promotion will use a truncating FP_ROUND, so we can fold +; away the (fpext (fp_round )) here. 
+ +; CHECK-LABEL: test_copysign_extended: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret +define float @test_copysign_extended(half %a, half %b) #0 { + %r = call half @llvm.copysign.f16(half %a, half %b) + %xr = fpext half %r to float + ret float %xr +} + +; CHECK-LABEL: test_floor: +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frintm [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_floor(half %a) #0 { %r = call half @llvm.floor.f16(half %a) @@ -695,10 +755,9 @@ define half @test_floor(half %a) #0 { } ; CHECK-LABEL: test_ceil: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frintp s0, s1 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frintp [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_ceil(half %a) #0 { %r = call half @llvm.ceil.f16(half %a) @@ -706,10 +765,9 @@ define half @test_ceil(half %a) #0 { } ; CHECK-LABEL: test_trunc: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frintz s0, s1 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frintz [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_trunc(half %a) #0 { %r = call half @llvm.trunc.f16(half %a) @@ -737,10 +795,9 @@ define half @test_nearbyint(half %a) #0 { } ; CHECK-LABEL: test_round: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frinta s0, s1 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frinta [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_round(half %a) #0 { %r = call half @llvm.round.f16(half %a) diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll 
b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll new file mode 100644 index 0000000000000..55fbf63319ee3 --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel -fast-isel-abort=0 -verify-machineinstrs < %s | FileCheck %s + +define void @test(i64 %a, i64 %b, i2* %c) { +; CHECK-LABEL: test +; CHECK: and [[REG1:w[0-9]+]], w8, #0x3 +; CHECK-NEXT: strb [[REG1]], {{\[}}x2{{\]}} +; CHECK-NEXT: tbz w9, #0, + %1 = trunc i64 %a to i2 + %2 = trunc i64 %b to i1 +; Force fast-isel to fall back to SDAG. + store i2 %1, i2* %c, align 8 + br i1 %2, label %bb1, label %bb2 + +bb1: + ret void + +bb2: + ret void +} diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll index da6ddbf5101ea..e04a62b85c8eb 100644 --- a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s -; CHECK-label: test_or +; CHECK-LABEL: test_or ; CHECK: cbnz w0, {{LBB[0-9]+_2}} ; CHECK: cbz w1, {{LBB[0-9]+_1}} define i64 @test_or(i32 %a, i32 %b) { @@ -18,7 +18,7 @@ bb4: ret i64 %2 } -; CHECK-label: test_ans +; CHECK-LABEL: test_and ; CHECK: cbz w0, {{LBB[0-9]+_2}} ; CHECK: cbnz w1, {{LBB[0-9]+_3}} define i64 @test_and(i32 %a, i32 %b) { @@ -36,7 +36,55 @@ bb4: ret i64 %2 } +; If the branch is unpredictable, don't add another branch. 
+ +; CHECK-LABEL: test_or_unpredictable +; CHECK: cmp w0, #0 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: cset w9, eq +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: tbnz w8, #0, +define i64 @test_or_unpredictable(i32 %a, i32 %b) { +bb1: + %0 = icmp eq i32 %a, 0 + %1 = icmp eq i32 %b, 0 + %or.cond = or i1 %0, %1 + br i1 %or.cond, label %bb3, label %bb4, !unpredictable !2 + +bb3: + ret i64 0 + +bb4: + %2 = call i64 @bar() + ret i64 %2 +} + +; CHECK-LABEL: test_and_unpredictable +; CHECK: cmp w0, #0 +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: cset w9, ne +; CHECK-NEXT: and w8, w8, w9 +; CHECK-NEXT: tbz w8, #0, +define i64 @test_and_unpredictable(i32 %a, i32 %b) { +bb1: + %0 = icmp ne i32 %a, 0 + %1 = icmp ne i32 %b, 0 + %or.cond = and i1 %0, %1 + br i1 %or.cond, label %bb4, label %bb3, !unpredictable !2 + +bb3: + ret i64 0 + +bb4: + %2 = call i64 @bar() + ret i64 %2 +} + declare i64 @bar() !0 = !{!"branch_weights", i32 5128, i32 32} !1 = !{!"branch_weights", i32 1024, i32 4136} +!2 = !{} + diff --git a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll new file mode 100644 index 0000000000000..2855419a1ca0c --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll @@ -0,0 +1,100 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs \ +; RUN: -aarch64-atomic-cfg-tidy=0 -disable-cgp -disable-branch-fold \ +; RUN: < %s | FileCheck %s + +; +; Verify that we don't mess up vector comparisons in fast-isel. 
+; + +define <2 x i32> @icmp_v2i32(<2 x i32> %a) { +; CHECK-LABEL: icmp_v2i32: +; CHECK: ; BB#0: +; CHECK-NEXT: cmeq.2s [[CMP:v[0-9]+]], v0, #0 +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b v0, [[CMP]], [[MASK]] +; CHECK-NEXT: ret + %c = icmp eq <2 x i32> %a, zeroinitializer + br label %bb2 +bb2: + %z = zext <2 x i1> %c to <2 x i32> + ret <2 x i32> %z +} + +define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) { +; CHECK-LABEL: icmp_constfold_v2i32: +; CHECK: ; BB#0: +; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b v0, v[[CMP]], [[MASK]] +; CHECK-NEXT: ret + %1 = icmp eq <2 x i32> %a, %a + br label %bb2 +bb2: + %2 = zext <2 x i1> %1 to <2 x i32> + ret <2 x i32> %2 +} + +define <4 x i32> @icmp_v4i32(<4 x i32> %a) { +; CHECK-LABEL: icmp_v4i32: +; CHECK: ; BB#0: +; CHECK-NEXT: cmeq.4s [[CMP:v[0-9]+]], v0, #0 +; CHECK-NEXT: xtn.4h [[CMPV4I16:v[0-9]+]], [[CMP]] +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b [[ZEXT:v[0-9]+]], [[CMPV4I16]], [[MASK]] +; CHECK-NEXT: ushll.4s v0, [[ZEXT]], #0 +; CHECK-NEXT: ret + %c = icmp eq <4 x i32> %a, zeroinitializer + br label %bb2 +bb2: + %z = zext <4 x i1> %c to <4 x i32> + ret <4 x i32> %z +} + +define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) { +; CHECK-LABEL: icmp_constfold_v4i32: +; CHECK: ; BB#0: +; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]] +; CHECK-NEXT: ushll.4s v0, [[ZEXT]], #0 +; CHECK-NEXT: ret + %1 = icmp eq <4 x i32> %a, %a + br label %bb2 +bb2: + %2 = zext <4 x i1> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <16 x i8> @icmp_v16i8(<16 x i8> %a) { +; CHECK-LABEL: icmp_v16i8: +; CHECK: ; BB#0: +; CHECK-NEXT: cmeq.16b [[CMP:v[0-9]+]], v0, #0 +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.16b 
[[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.16b v0, [[CMP]], [[MASK]] +; CHECK-NEXT: ret + %c = icmp eq <16 x i8> %a, zeroinitializer + br label %bb2 +bb2: + %z = zext <16 x i1> %c to <16 x i8> + ret <16 x i8> %z +} + +define <16 x i8> @icmp_constfold_v16i8(<16 x i8> %a) { +; CHECK-LABEL: icmp_constfold_v16i8: +; CHECK: ; BB#0: +; CHECK-NEXT: movi.2d [[CMP:v[0-9]+]], #0xffffffffffffffff +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.16b [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.16b v0, [[CMP]], [[MASK]] +; CHECK-NEXT: ret + %1 = icmp eq <16 x i8> %a, %a + br label %bb2 +bb2: + %2 = zext <16 x i1> %1 to <16 x i8> + ret <16 x i8> %2 +} diff --git a/test/CodeGen/AArch64/fast-isel-folded-shift.ll b/test/CodeGen/AArch64/fast-isel-folded-shift.ll new file mode 100644 index 0000000000000..b881ef5c6d52d --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-folded-shift.ll @@ -0,0 +1,125 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s + +; Test invalid shift values. This will fall-back to SDAG. 
+; AND +define zeroext i8 @and_rs_i8(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: and_rs_i8 +; CHECK: and [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff + %1 = shl i8 %b, 8 + %2 = and i8 %a, %1 + ret i8 %2 +} + +define zeroext i16 @and_rs_i16(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: and_rs_i16 +; CHECK: and [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff + %1 = shl i16 %b, 16 + %2 = and i16 %a, %1 + ret i16 %2 +} + +define i32 @and_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: and_rs_i32 +; CHECK: and w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = and i32 %a, %1 + ret i32 %2 +} + +define i64 @and_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: and_rs_i64 +; CHECK: and x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = and i64 %a, %1 + ret i64 %2 +} + +; OR +define zeroext i8 @or_rs_i8(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: or_rs_i8 +; CHECK: orr [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff + %1 = shl i8 %b, 8 + %2 = or i8 %a, %1 + ret i8 %2 +} + +define zeroext i16 @or_rs_i16(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: or_rs_i16 +; CHECK: orr [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff + %1 = shl i16 %b, 16 + %2 = or i16 %a, %1 + ret i16 %2 +} + +define i32 @or_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: or_rs_i32 +; CHECK: orr w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = or i32 %a, %1 + ret i32 %2 +} + +define i64 @or_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: or_rs_i64 +; CHECK: orr x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = or i64 %a, %1 + ret i64 %2 +} + +; XOR +define zeroext i8 @xor_rs_i8(i8 %a, i8 %b) { +; CHECK-LABEL: xor_rs_i8 +; CHECK: eor [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff + %1 = shl i8 %b, 8 + %2 = xor i8 %a, %1 + ret i8 %2 +} + +define zeroext i16 @xor_rs_i16(i16 %a, i16 %b) { +; CHECK-LABEL: xor_rs_i16 +; CHECK: eor [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff + %1 = shl i16 %b, 16 + %2 = xor i16 %a, 
%1 + ret i16 %2 +} + +define i32 @xor_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: xor_rs_i32 +; CHECK: eor w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = xor i32 %a, %1 + ret i32 %2 +} + +define i64 @xor_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: xor_rs_i64 +; CHECK: eor x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = xor i64 %a, %1 + ret i64 %2 +} + +;ADD +define i32 @add_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: add_rs_i32 +; CHECK: add w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = add i32 %a, %1 + ret i32 %2 +} + +define i64 @add_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: add_rs_i64 +; CHECK: add x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = add i64 %a, %1 + ret i64 %2 +} + diff --git a/test/CodeGen/AArch64/fast-isel-logic-op.ll b/test/CodeGen/AArch64/fast-isel-logic-op.ll index 89c5f2c480243..16d0429fe98df 100644 --- a/test/CodeGen/AArch64/fast-isel-logic-op.ll +++ b/test/CodeGen/AArch64/fast-isel-logic-op.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s ; AND diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll index a392619a768dc..b5e03f08280ff 100644 --- a/test/CodeGen/AArch64/fastcc-reserved.ll +++ b/test/CodeGen/AArch64/fastcc-reserved.ll @@ -16,7 +16,7 @@ define fastcc void @foo(i32 %in) { ; CHECK: mov x29, sp ; Reserve space for call-frame: -; CHECK: sub sp, sp, #16 +; CHECK: str w{{[0-9]+}}, [sp, #-16]! call fastcc void @will_pop([8 x i32] undef, i32 42) ; CHECK: bl will_pop @@ -42,7 +42,7 @@ define void @foo1(i32 %in) { ; CHECK: mov x29, sp ; Reserve space for call-frame -; CHECK: sub sp, sp, #16 +; CHECK: str w{{[0-9]+}}, [sp, #-16]! 
call void @wont_pop([8 x i32] undef, i32 42) ; CHECK: bl wont_pop diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll index 9917fcd044fdd..f021eb2326188 100644 --- a/test/CodeGen/AArch64/fastcc.ll +++ b/test/CodeGen/AArch64/fastcc.ll @@ -7,12 +7,12 @@ define fastcc void @func_stack0() { ; CHECK-LABEL: func_stack0: ; CHECK: mov x29, sp -; CHECK-NEXT: sub sp, sp, #32 +; CHECK: str w{{[0-9]+}}, [sp, #-32]! ; CHECK-TAIL-LABEL: func_stack0: ; CHECK-TAIL: stp x29, x30, [sp, #-16]! ; CHECK-TAIL-NEXT: mov x29, sp -; CHECK-TAIL-NEXT: sub sp, sp, #32 +; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]! call fastcc void @func_stack8([8 x i32] undef, i32 42) @@ -55,13 +55,13 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) { ; CHECK-LABEL: func_stack8: ; CHECK: stp x29, x30, [sp, #-16]! ; CHECK: mov x29, sp -; CHECK: sub sp, sp, #32 +; CHECK: str w{{[0-9]+}}, [sp, #-32]! ; CHECK-TAIL-LABEL: func_stack8: ; CHECK-TAIL: stp x29, x30, [sp, #-16]! ; CHECK-TAIL: mov x29, sp -; CHECK-TAIL: sub sp, sp, #32 +; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]! 
call fastcc void @func_stack8([8 x i32] undef, i32 42) diff --git a/test/CodeGen/AArch64/fcvt_combine.ll b/test/CodeGen/AArch64/fcvt_combine.ll new file mode 100644 index 0000000000000..093ce4a4cd857 --- /dev/null +++ b/test/CodeGen/AArch64/fcvt_combine.ll @@ -0,0 +1,154 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzs.2s v0, v0, #4 +; CHECK: ret +define <2 x i32> @test1(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; CHECK-LABEL: test2 +; CHECK-NOT: fmul.4s +; CHECK: fcvtzs.4s v0, v0, #3 +; CHECK: ret +define <4 x i32> @test2(<4 x float> %f) { + %mul.i = fmul <4 x float> %f, + %vcvt.i = fptosi <4 x float> %mul.i to <4 x i32> + ret <4 x i32> %vcvt.i +} + +; CHECK-LABEL: test3 +; CHECK-NOT: fmul.2d +; CHECK: fcvtzs.2d v0, v0, #5 +; CHECK: ret +define <2 x i64> @test3(<2 x double> %d) { + %mul.i = fmul <2 x double> %d, + %vcvt.i = fptosi <2 x double> %mul.i to <2 x i64> + ret <2 x i64> %vcvt.i +} + +; Truncate double to i32 +; CHECK-LABEL: test4 +; CHECK-NOT: fmul.2d v0, v0, #4 +; CHECK: fcvtzs.2d v0, v0 +; CHECK: xtn.2s +; CHECK: ret +define <2 x i32> @test4(<2 x double> %d) { + %mul.i = fmul <2 x double> %d, + %vcvt.i = fptosi <2 x double> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Truncate float to i16 +; CHECK-LABEL: test5 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzs.2s v0, v0, #4 +; CHECK: ret +define <2 x i16> @test5(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i16> + ret <2 x i16> %vcvt.i +} + +; Don't convert float to i64 +; CHECK-LABEL: test6 +; CHECK: fmov.2s v1, #16.00000000 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtl v0.2d, v0.2s +; CHECK: fcvtzs.2d v0, v0 +; CHECK: ret +define <2 x i64> @test6(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x 
i64> + ret <2 x i64> %vcvt.i +} + +; Check unsigned conversion. +; CHECK-LABEL: test7 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzu.2s v0, v0, #4 +; CHECK: ret +define <2 x i32> @test7(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test which should not fold due to non-power of 2. +; CHECK-LABEL: test8 +; CHECK: fmov.2s v1, #17.00000000 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzu.2s v0, v0 +; CHECK: ret +define <2 x i32> @test8(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test which should not fold due to non-matching power of 2. +; CHECK-LABEL: test9 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzu.2s v0, v0 +; CHECK: ret +define <2 x i32> @test9(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Don't combine all undefs. +; CHECK-LABEL: test10 +; CHECK: fmul.2s v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CHECK: fcvtzu.2s v{{[0-9]+}}, v{{[0-9]+}} +; CHECK: ret +define <2 x i32> @test10(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Combine if mix of undef and pow2. +; CHECK-LABEL: test11 +; CHECK: fcvtzu.2s v0, v0, #3 +; CHECK: ret +define <2 x i32> @test11(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Don't combine when multiplied by 0.0. +; CHECK-LABEL: test12 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzs.2s v0, v0 +; CHECK: ret +define <2 x i32> @test12(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test which should not fold due to power of 2 out of range (i.e., 2^33). 
+; CHECK-LABEL: test13 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzs.2s v0, v0 +; CHECK: ret +define <2 x i32> @test13(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test case where const is max power of 2 (i.e., 2^32). +; CHECK-LABEL: test14 +; CHECK: fcvtzs.2s v0, v0, #32 +; CHECK: ret +define <2 x i32> @test14(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} diff --git a/test/CodeGen/AArch64/fdiv_combine.ll b/test/CodeGen/AArch64/fdiv_combine.ll new file mode 100644 index 0000000000000..6f38a267ec3fe --- /dev/null +++ b/test/CodeGen/AArch64/fdiv_combine.ll @@ -0,0 +1,115 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s + +; Test signed conversion. +; CHECK-LABEL: @test1 +; CHECK: scvtf.2s v0, v0, #4 +; CHECK: ret +define <2 x float> @test1(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test unsigned conversion. +; CHECK-LABEL: @test2 +; CHECK: ucvtf.2s v0, v0, #3 +; CHECK: ret +define <2 x float> @test2(<2 x i32> %in) { +entry: + %vcvt.i = uitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test which should not fold due to non-power of 2. +; CHECK-LABEL: @test3 +; CHECK: scvtf.2s v0, v0 +; CHECK: fmov.2s v1, #9.00000000 +; CHECK: fdiv.2s v0, v0, v1 +; CHECK: ret +define <2 x float> @test3(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test which should not fold due to power of 2 out of range. 
+; CHECK-LABEL: @test4 +; CHECK: scvtf.2s v0, v0 +; CHECK: movi.2s v1, #0x50, lsl #24 +; CHECK: fdiv.2s v0, v0, v1 +; CHECK: ret +define <2 x float> @test4(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test case where const is max power of 2 (i.e., 2^32). +; CHECK-LABEL: @test5 +; CHECK: scvtf.2s v0, v0, #32 +; CHECK: ret +define <2 x float> @test5(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test quadword. +; CHECK-LABEL: @test6 +; CHECK: scvtf.4s v0, v0, #2 +; CHECK: ret +define <4 x float> @test6(<4 x i32> %in) { +entry: + %vcvt.i = sitofp <4 x i32> %in to <4 x float> + %div.i = fdiv <4 x float> %vcvt.i, + ret <4 x float> %div.i +} + +; Test unsigned i16 to float +; CHECK-LABEL: @test7 +; CHECK: ushll.4s v0, v0, #0 +; CHECK: ucvtf.4s v0, v0, #1 +; CHECK: ret +define <4 x float> @test7(<4 x i16> %in) { + %conv = uitofp <4 x i16> %in to <4 x float> + %shift = fdiv <4 x float> %conv, + ret <4 x float> %shift +} + +; Test signed i16 to float +; CHECK-LABEL: @test8 +; CHECK: sshll.4s v0, v0, #0 +; CHECK: scvtf.4s v0, v0, #2 +; CHECK: ret +define <4 x float> @test8(<4 x i16> %in) { + %conv = sitofp <4 x i16> %in to <4 x float> + %shift = fdiv <4 x float> %conv, + ret <4 x float> %shift +} + +; Can't convert i64 to float. 
+; CHECK-LABEL: @test9 +; CHECK: ucvtf.2d v0, v0 +; CHECK: fcvtn v0.2s, v0.2d +; CHECK: movi.2s v1, #0x40, lsl #24 +; CHECK: fdiv.2s v0, v0, v1 +; CHECK: ret +define <2 x float> @test9(<2 x i64> %in) { + %conv = uitofp <2 x i64> %in to <2 x float> + %shift = fdiv <2 x float> %conv, + ret <2 x float> %shift +} + +; CHECK-LABEL: @test10 +; CHECK: ucvtf.2d v0, v0, #1 +; CHECK: ret +define <2 x double> @test10(<2 x i64> %in) { + %conv = uitofp <2 x i64> %in to <2 x double> + %shift = fdiv <2 x double> %conv, + ret <2 x double> %shift +} diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll index 2dd0d1245930b..c0fec4d171cd1 100644 --- a/test/CodeGen/AArch64/fold-constants.ll +++ b/test/CodeGen/AArch64/fold-constants.ll @@ -3,9 +3,6 @@ define i64 @dotests_616() { ; CHECK-LABEL: dotests_616 ; CHECK: movi d0, #0000000000000000 -; CHECK-NEXT: umov w8, v0.b[2] -; CHECK-NEXT: sbfx w8, w8, #0, #1 -; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -19,3 +16,19 @@ entry: %vget_lane = extractelement <1 x i64> %4, i32 0 ret i64 %vget_lane } + +; PR25763 - folding constant vector comparisons with sign-extended result +define <8 x i16> @dotests_458() { +; CHECK-LABEL: dotests_458 +; CHECK: movi d0, #0x00000000ff0000 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret +entry: + %vclz_v.i = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> , i1 false) #6 + %vsra_n = lshr <8 x i8> %vclz_v.i, + %name_6 = or <8 x i8> %vsra_n, + %cmp.i603 = icmp slt <8 x i8> %name_6, + %vmovl.i4.i = sext <8 x i1> %cmp.i603 to <8 x i16> + ret <8 x i16> %vmovl.i4.i +} +declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll index 0dbda152fca91..f6e4bdf734599 100644 --- a/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -130,7 +130,6 @@ define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) { ret <4 x 
i16> %2 } - define <4 x half> @sitofp_i8(<4 x i8> %a) #0 { ; CHECK-LABEL: sitofp_i8: ; CHECK-NEXT: shl [[OP1:v[0-9]+\.4h]], v0.4h, #8 @@ -218,4 +217,54 @@ define <4 x half> @uitofp_i64(<4 x i64> %a) #0 { ret <4 x half> %1 } +define void @test_insert_at_zero(half %a, <4 x half>* %b) #0 { +; CHECK-LABEL: test_insert_at_zero: +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %1 = insertelement <4 x half> undef, half %a, i64 0 + store <4 x half> %1, <4 x half>* %b, align 4 + ret void +} + +define <4 x i8> @fptosi_i8(<4 x half> %a) #0 { +; CHECK-LABEL: fptosi_i8: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptosi<4 x half> %a to <4 x i8> + ret <4 x i8> %1 +} + +define <4 x i16> @fptosi_i16(<4 x half> %a) #0 { +; CHECK-LABEL: fptosi_i16: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptosi<4 x half> %a to <4 x i16> + ret <4 x i16> %1 +} + +define <4 x i8> @fptoui_i8(<4 x half> %a) #0 { +; CHECK-LABEL: fptoui_i8: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; NOTE: fcvtzs selected here because the xtn shaves the sign bit +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptoui<4 x half> %a to <4 x i8> + ret <4 x i8> %1 +} + +define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { +; CHECK-LABEL: fptoui_i16: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzu [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptoui<4 x half> %a to <4 x i16> + ret <4 x i16> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll index 10a8c22d6f7ef..137d1f358a304 100644 --- a/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ 
b/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -358,4 +358,67 @@ define <8 x half> @uitofp_i64(<8 x i64> %a) #0 { ret <8 x half> %1 } +define void @test_insert_at_zero(half %a, <8 x half>* %b) #0 { +; CHECK-LABEL: test_insert_at_zero: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %1 = insertelement <8 x half> undef, half %a, i64 0 + store <8 x half> %1, <8 x half>* %b, align 4 + ret void +} + +define <8 x i8> @fptosi_i8(<8 x half> %a) #0 { +; CHECK-LABEL: fptosi_i8: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: xtn v0.8b, [[I16]].8h +; CHECK-NEXT: ret + %1 = fptosi<8 x half> %a to <8 x i8> + ret <8 x i8> %1 +} + +define <8 x i16> @fptosi_i16(<8 x half> %a) #0 { +; CHECK-LABEL: fptosi_i16: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret + %1 = fptosi<8 x half> %a to <8 x i16> + ret <8 x i16> %1 +} + +define <8 x i8> @fptoui_i8(<8 x half> %a) #0 { +; CHECK-LABEL: fptoui_i8: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: xtn v0.8b, [[I16]].8h +; CHECK-NEXT: ret + %1 = fptoui<8 x half> %a to <8 x i8> + ret <8 x i8> %1 +} + +define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { +; CHECK-LABEL: fptoui_i16: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzu 
[[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret + %1 = fptoui<8 x half> %a to <8 x i16> + ret <8 x i16> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/free-zext.ll b/test/CodeGen/AArch64/free-zext.ll index cff11f85bda4e..ea4f1f4e10f3e 100644 --- a/test/CodeGen/AArch64/free-zext.ll +++ b/test/CodeGen/AArch64/free-zext.ll @@ -1,7 +1,7 @@ ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s define i64 @test_free_zext(i8* %a, i16* %b) { -; CHECK-LABEL: test_free_zext +; CHECK-LABEL: test_free_zext: ; CHECK-DAG: ldrb w[[A:[0-9]+]], [x0] ; CHECK: ldrh w[[B:[0-9]+]], [x1] ; CHECK: add x0, x[[B]], x[[A]] @@ -12,3 +12,60 @@ define i64 @test_free_zext(i8* %a, i16* %b) { %add = add nsw i64 %conv1, %conv ret i64 %add } + +define void @test_free_zext2(i32* %ptr, i32* %dst1, i64* %dst2) { +; CHECK-LABEL: test_free_zext2: +; CHECK: ldrh w[[A:[0-9]+]], [x0] +; CHECK-NOT: and x +; CHECK: str w[[A]], [x1] +; CHECK: str x[[A]], [x2] + %load = load i32, i32* %ptr, align 8 + %load16 = and i32 %load, 65535 + %load64 = zext i32 %load16 to i64 + store i32 %load16, i32* %dst1, align 4 + store i64 %load64, i64* %dst2, align 8 + ret void +} + +; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads +; feeding a phi that zext's each loaded value. 
+define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) { +; CHECK-LABEL: test_free_zext3: +bb1: +; CHECK: ldrh [[REG:w[0-9]+]] +; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff + %tmp1 = load i32, i32* %ptr, align 4 + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %bb2, label %bb3 +bb2: +; CHECK: ldrh [[REG2:w[0-9]+]] +; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff + %tmp2 = load i32, i32* %ptr2, align 4 + br label %bb3 +bb3: + %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] +; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff + %tmpand = and i32 %tmp3, 65535 + ret i32 %tmpand +} + +; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able +; load feeding a phi in the same block. +define void @test_free_zext4(i32* %ptr, i32* %ptr2, i32* %dst) { +; CHECK-LABEL: test_free_zext4: +; CHECK: ldrh [[REG:w[0-9]+]] +; TODO: fix isel to remove final and XCHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff +; CHECK: ldrh [[REG:w[0-9]+]] +bb1: + %load1 = load i32, i32* %ptr, align 4 + br label %loop +loop: + %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ] + %and = and i32 %phi, 65535 + store i32 %and, i32* %dst, align 4 + %load2 = load i32, i32* %ptr2, align 4 + %cmp = icmp ne i32 %and, 0 + br i1 %cmp, label %loop, label %end +end: + ret void +} diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll index 9100ae39282bb..2ea13e3888678 100644 --- a/test/CodeGen/AArch64/func-argpassing.ll +++ b/test/CodeGen/AArch64/func-argpassing.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-post-ra | FileCheck --check-prefix=CHECK %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -disable-post-ra | 
FileCheck --check-prefix=CHECK-NOFP %s %myStruct = type { i64 , i8, i32 } diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll index 22a33157fd555..2f45666ba13ae 100644 --- a/test/CodeGen/AArch64/func-calls.ll +++ b/test/CodeGen/AArch64/func-calls.ll @@ -89,11 +89,11 @@ define void @check_stack_args() { ; that varstruct is passed on the stack. Rather dependent on how a ; memcpy gets created, but the following works for now. -; CHECK-DAG: str {{q[0-9]+}}, [sp] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16] ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b -; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp] +; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]! ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]] diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll index 657778e34187d..5e820b8bb3037 100644 --- a/test/CodeGen/AArch64/global-alignment.ll +++ b/test/CodeGen/AArch64/global-alignment.ll @@ -3,7 +3,7 @@ @var32 = global [3 x i32] zeroinitializer @var64 = global [3 x i64] zeroinitializer @var32_align64 = global [3 x i32] zeroinitializer, align 8 -@alias = alias [3 x i32]* @var32_align64 +@alias = alias [3 x i32], [3 x i32]* @var32_align64 define i64 @test_align32() { ; CHECK-LABEL: test_align32: diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll index 14b04303ffb38..b93f41c07df9e 100644 --- a/test/CodeGen/AArch64/global-merge-1.ll +++ b/test/CodeGen/AArch64/global-merge-1.ll @@ -12,16 +12,20 @@ define void @f1(i32 %a1, i32 %a2) { ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals@PAGE +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals@PAGEOFF +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF store i32 %a1, i32* @m, align 4 store i32 %a2, i32* @n, align 4 ret void } -;CHECK: 
.type _MergedGlobals,@object // @_MergedGlobals -;CHECK: .local _MergedGlobals -;CHECK: .comm _MergedGlobals,8,8 +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals +;CHECK: .local .L_MergedGlobals +;CHECK: .comm .L_MergedGlobals,8,8 +;CHECK: m = .L_MergedGlobals +;CHECK: n = .L_MergedGlobals+4 -;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3 ; @_MergedGlobals +;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals,8,3 ; @_MergedGlobals +;CHECK-APPLE-IOS-NOT: _m = l__MergedGlobals +;CHECK-APPLE-IOS-NOT: _n = l__MergedGlobals+4 diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll index af684039bf10f..53bed1d9bc093 100644 --- a/test/CodeGen/AArch64/global-merge-2.ll +++ b/test/CodeGen/AArch64/global-merge-2.ll @@ -9,8 +9,8 @@ define void @f1(i32 %a1, i32 %a2) { ;CHECK-APPLE-IOS-LABEL: _f1: ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF ;CHECK-APPLE-IOS-NOT: adrp store i32 %a1, i32* @x, align 4 store i32 %a2, i32* @y, align 4 @@ -19,34 +19,34 @@ define void @f1(i32 %a1, i32 %a2) { define void @g1(i32 %a1, i32 %a2) { ;CHECK-APPLE-IOS-LABEL: _g1: -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF ;CHECK-APPLE-IOS-NOT: adrp store i32 %a1, i32* @y, align 4 store i32 %a2, i32* @z, align 4 ret void } -;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x -;CHECK: .globl _MergedGlobals_x -;CHECK: .align 3 -;CHECK: _MergedGlobals_x: -;CHECK: .size _MergedGlobals_x, 12 +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals +;CHECK: .local .L_MergedGlobals +;CHECK: .comm .L_MergedGlobals,12,8 ;CHECK: .globl x -;CHECK: x = _MergedGlobals_x +;CHECK: x = 
.L_MergedGlobals +;CHECK: .size x, 4 ;CHECK: .globl y -;CHECK: y = _MergedGlobals_x+4 +;CHECK: y = .L_MergedGlobals+4 +;CHECK: .size y, 4 ;CHECK: .globl z -;CHECK: z = _MergedGlobals_x+8 +;CHECK: z = .L_MergedGlobals+8 +;CHECK: .size z, 4 -;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x -;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,3 +;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals,12,3 ;CHECK-APPLE-IOS: .globl _x -;CHECK-APPLE-IOS: _x = __MergedGlobals_x +;CHECK-APPLE-IOS: = l__MergedGlobals ;CHECK-APPLE-IOS: .globl _y -;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4 +;CHECK-APPLE-IOS: _y = l__MergedGlobals+4 ;CHECK-APPLE-IOS: .globl _z -;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8 +;CHECK-APPLE-IOS: _z = l__MergedGlobals+8 ;CHECK-APPLE-IOS: .subsections_via_symbols diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll index 925108308e563..6895380ca63e9 100644 --- a/test/CodeGen/AArch64/global-merge-3.ll +++ b/test/CodeGen/AArch64/global-merge-3.ll @@ -1,17 +1,17 @@ -; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS @x = global [1000 x i32] zeroinitializer, align 1 @y = global [1000 x i32] zeroinitializer, align 1 @z = internal global i32 1, align 4 define void 
@f1(i32 %a1, i32 %a2, i32 %a3) { -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF -;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_y@PAGE -;CHECK-APPLE-IOS: add x9, x9, __MergedGlobals_y@PAGEOFF +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF +;CHECK-APPLE-IOS: adrp x9, l__MergedGlobals.1@PAGE +;CHECK-APPLE-IOS: add x9, x9, l__MergedGlobals.1@PAGEOFF %x3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @x, i32 0, i64 3 %y3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @y, i32 0, i64 3 store i32 %a1, i32* %x3, align 4 @@ -20,32 +20,32 @@ define void @f1(i32 %a1, i32 %a2, i32 %a3) { ret void } -;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x -;CHECK: .globl _MergedGlobals_x +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals ;CHECK: .align 4 -;CHECK: _MergedGlobals_x: -;CHECK: .size _MergedGlobals_x, 4004 +;CHECK: .L_MergedGlobals: +;CHECK: .size .L_MergedGlobals, 4004 -;CHECK: .type _MergedGlobals_y,@object // @_MergedGlobals_y -;CHECK: .globl _MergedGlobals_y -;CHECK: _MergedGlobals_y: -;CHECK: .size _MergedGlobals_y, 4000 +;CHECK: .type .L_MergedGlobals.1,@object // @_MergedGlobals.1 +;CHECK: .local .L_MergedGlobals.1 +;CHECK: .comm .L_MergedGlobals.1,4000,16 -;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x ;CHECK-APPLE-IOS: .align 4 -;CHECK-APPLE-IOS: __MergedGlobals_x: +;CHECK-APPLE-IOS: l__MergedGlobals: ;CHECK-APPLE-IOS: .long 1 ;CHECK-APPLE-IOS: .space 4000 -;CHECK-APPLE-IOS: .globl __MergedGlobals_y ; @_MergedGlobals_y -;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_y,4000,4 +;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals.1,4000,4 +;CHECK: z = .L_MergedGlobals ;CHECK: .globl x -;CHECK: x = _MergedGlobals_x+4 +;CHECK: x = .L_MergedGlobals+4 +;CHECK: .size x, 4000 ;CHECK: .globl y -;CHECK: y = _MergedGlobals_y +;CHECK: y = .L_MergedGlobals.1 +;CHECK: .size 
y, 4000 +;CHECK-APPLE-IOS-NOT: _z = l__MergedGlobals ;CHECK-APPLE-IOS:.globl _x -;CHECK-APPLE-IOS: _x = __MergedGlobals_x+4 +;CHECK-APPLE-IOS: _x = l__MergedGlobals+4 ;CHECK-APPLE-IOS:.globl _y -;CHECK-APPLE-IOS: _y = __MergedGlobals_y +;CHECK-APPLE-IOS: _y = l__MergedGlobals.1 diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll index bc6b68a9c046a..a5109f6e8ea59 100644 --- a/test/CodeGen/AArch64/global-merge-4.ll +++ b/test/CodeGen/AArch64/global-merge-4.ll @@ -64,9 +64,9 @@ define internal i32* @returnFoo() #1 { ret i32* getelementptr inbounds ([5 x i32], [5 x i32]* @foo, i64 0, i64 0) } -;CHECK: .type _MergedGlobals,@object // @_MergedGlobals -;CHECK: .local _MergedGlobals -;CHECK: .comm _MergedGlobals,60,16 +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals +;CHECK: .local .L_MergedGlobals +;CHECK: .comm .L_MergedGlobals,60,16 attributes #0 = { nounwind ssp } attributes #1 = { nounwind readnone ssp } diff --git a/test/CodeGen/AArch64/global-merge-group-by-use.ll b/test/CodeGen/AArch64/global-merge-group-by-use.ll index ddc044ed9e082..8b3fc97c9e2e3 100644 --- a/test/CodeGen/AArch64/global-merge-group-by-use.ll +++ b/test/CodeGen/AArch64/global-merge-group-by-use.ll @@ -12,7 +12,7 @@ ; CHECK-LABEL: f1: define void @f1(i32 %a1, i32 %a2) #0 { -; CHECK-NEXT: adrp x8, [[SET1:__MergedGlobals.[0-9]*]]@PAGE +; CHECK-NEXT: adrp x8, [[SET1:l__MergedGlobals.[0-9]*]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET1]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret @@ -27,7 +27,7 @@ define void @f1(i32 %a1, i32 %a2) #0 { ; CHECK-LABEL: f2: define void @f2(i32 %a1, i32 %a2, i32 %a3) #0 { -; CHECK-NEXT: adrp x8, [[SET2:__MergedGlobals.[0-9]*]]@PAGE +; CHECK-NEXT: adrp x8, [[SET2:l__MergedGlobals.[0-9]*]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET2]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: str w2, [x8, #8] @@ -48,7 +48,7 @@ define void @f2(i32 %a1, i32 %a2, i32 %a3) #0 { ; CHECK-LABEL: f3: define void @f3(i32 %a1, i32 %a2) 
#0 { ; CHECK-NEXT: adrp x8, _m3@PAGE -; CHECK-NEXT: adrp x9, [[SET3:__MergedGlobals[0-9]*]]@PAGE +; CHECK-NEXT: adrp x9, [[SET3:l__MergedGlobals[0-9]*]]@PAGE ; CHECK-NEXT: str w0, [x8, _m3@PAGEOFF] ; CHECK-NEXT: str w1, [x9, [[SET3]]@PAGEOFF] ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll index e83cbab140a74..3994389257719 100644 --- a/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll +++ b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll @@ -11,7 +11,7 @@ ; CHECK-LABEL: f1: define void @f1(i32 %a1, i32 %a2) minsize nounwind { -; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE +; CHECK-NEXT: adrp x8, [[SET:l__MergedGlobals]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/global-merge-ignore-single-use.ll b/test/CodeGen/AArch64/global-merge-ignore-single-use.ll index e6de4699132ae..c3756a85feff5 100644 --- a/test/CodeGen/AArch64/global-merge-ignore-single-use.ll +++ b/test/CodeGen/AArch64/global-merge-ignore-single-use.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: f1: define void @f1(i32 %a1, i32 %a2) #0 { -; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE +; CHECK-NEXT: adrp x8, [[SET:l__MergedGlobals]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll index b2c11c7517c0b..d2133213f1864 100644 --- a/test/CodeGen/AArch64/ldst-opt.ll +++ b/test/CodeGen/AArch64/ldst-opt.ll @@ -3,11 +3,15 @@ ; This file contains tests for the AArch64 load/store optimizer. 
%padding = type { i8*, i8*, i8*, i8* } +%s.byte = type { i8, i8 } +%s.halfword = type { i16, i16 } %s.word = type { i32, i32 } %s.doubleword = type { i64, i32 } %s.quadword = type { fp128, i32 } %s.float = type { float, i32 } %s.double = type { double, i32 } +%struct.byte = type { %padding, %s.byte } +%struct.halfword = type { %padding, %s.halfword } %struct.word = type { %padding, %s.word } %struct.doubleword = type { %padding, %s.doubleword } %struct.quadword = type { %padding, %s.quadword } @@ -24,6 +28,62 @@ ; ; with X being either w1, x1, s0, d0 or q0. +declare void @bar_byte(%s.byte*, i8) + +define void @load-pre-indexed-byte(%struct.byte* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-byte +; CHECK: ldrb w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1, i32 0 + %add = load i8, i8* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1 + tail call void @bar_byte(%s.byte* %c, i8 %add) + ret void +} + +define void @store-pre-indexed-byte(%struct.byte* %ptr, i8 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-byte +; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1, i32 0 + store i8 %val, i8* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1 + tail call void @bar_byte(%s.byte* %c, i8 %val) + ret void +} + +declare void @bar_halfword(%s.halfword*, i16) + +define void @load-pre-indexed-halfword(%struct.halfword* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-halfword +; CHECK: ldrh w{{[0-9]+}}, [x{{[0-9]+}}, #32]! 
+entry: + %a = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1, i32 0 + %add = load i16, i16* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1 + tail call void @bar_halfword(%s.halfword* %c, i16 %add) + ret void +} + +define void @store-pre-indexed-halfword(%struct.halfword* %ptr, i16 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-halfword +; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1, i32 0 + store i16 %val, i16* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1 + tail call void @bar_halfword(%s.halfword* %c, i16 %val) + ret void +} + declare void @bar_word(%s.word*, i32) define void @load-pre-indexed-word(%struct.word* %ptr) nounwind { @@ -164,6 +224,48 @@ bar: ret void } +; Check the following transform: +; +; (ldp|stp) w1, w2 [x0, #32] +; ... +; add x0, x0, #32 +; -> +; (ldp|stp) w1, w2, [x0, #32]! +; + +define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind { +; CHECK-LABEL: load-pair-pre-indexed-word +; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 + %a1 = load i32, i32* %a, align 4 + %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1 + %b1 = load i32, i32* %b, align 4 + %add = add i32 %a1, %b1 + br label %bar +bar: + %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %add) + ret void +} + +define void @store-pair-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind { +; CHECK-LABEL: store-pair-pre-indexed-word +; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! 
+; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 + store i32 %val, i32* %a, align 4 + %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1 + store i32 %val, i32* %b, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %val) + ret void +} + ; Check the following transform: ; ; add x8, x8, #16 @@ -174,11 +276,11 @@ bar: ; ; with X being either w0, x0, s0, d0 or q0. -%pre.struct.i32 = type { i32, i32, i32} -%pre.struct.i64 = type { i32, i64, i64} -%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>} -%pre.struct.float = type { i32, float, float} -%pre.struct.double = type { i32, double, double} +%pre.struct.i32 = type { i32, i32, i32, i32, i32} +%pre.struct.i64 = type { i32, i64, i64, i64, i64} +%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>, <2 x i64>} +%pre.struct.float = type { i32, float, float, float} +%pre.struct.double = type { i32, double, double, double} define i32 @load-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond, %pre.struct.i32* %load2) nounwind { @@ -270,6 +372,96 @@ return: ret double %ret } +define i32 @load-pre-indexed-word3(%pre.struct.i32** %this, i1 %cond, + %pre.struct.i32* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-word3 +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #12]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i32*, %pre.struct.i32** %this + %gep1 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load1, i64 0, i32 3 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load2, i64 0, i32 4 + br label %return +return: + %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load i32, i32* %retptr + ret i32 %ret +} + +define i64 @load-pre-indexed-doubleword3(%pre.struct.i64** %this, i1 %cond, + %pre.struct.i64* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-doubleword3 +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #16]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i64*, %pre.struct.i64** %this + %gep1 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load i64, i64* %retptr + ret i64 %ret +} + +define <2 x i64> @load-pre-indexed-quadword3(%pre.struct.i128** %this, i1 %cond, + %pre.struct.i128* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-quadword3 +; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i128*, %pre.struct.i128** %this + %gep1 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load <2 x i64>, <2 x i64>* %retptr + ret <2 x i64> %ret +} + +define float @load-pre-indexed-float3(%pre.struct.float** %this, i1 %cond, + %pre.struct.float* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-float3 +; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #8]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.float*, %pre.struct.float** %this + %gep1 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load float, float* %retptr + ret float %ret +} + +define double @load-pre-indexed-double3(%pre.struct.double** %this, i1 %cond, + %pre.struct.double* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-double3 +; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #16]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.double*, %pre.struct.double** %this + %gep1 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load double, double* %retptr + ret double %ret +} + ; Check the following transform: ; ; add x8, x8, #16 @@ -375,6 +567,101 @@ return: ret void } +define void @store-pre-indexed-word3(%pre.struct.i32** %this, i1 %cond, + %pre.struct.i32* %load2, + i32 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-word3 +; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #12]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i32*, %pre.struct.i32** %this + %gep1 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load1, i64 0, i32 3 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load2, i64 0, i32 4 + br label %return +return: + %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ] + store i32 %val, i32* %retptr + ret void +} + +define void @store-pre-indexed-doubleword3(%pre.struct.i64** %this, i1 %cond, + %pre.struct.i64* %load2, + i64 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-doubleword3 +; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #24]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i64*, %pre.struct.i64** %this + %gep1 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load1, i64 0, i32 3 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load2, i64 0, i32 4 + br label %return +return: + %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ] + store i64 %val, i64* %retptr + ret void +} + +define void @store-pre-indexed-quadword3(%pre.struct.i128** %this, i1 %cond, + %pre.struct.i128* %load2, + <2 x i64> %val) nounwind { +; CHECK-LABEL: store-pre-indexed-quadword3 +; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i128*, %pre.struct.i128** %this + %gep1 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ] + store <2 x i64> %val, <2 x i64>* %retptr + ret void +} + +define void @store-pre-indexed-float3(%pre.struct.float** %this, i1 %cond, + %pre.struct.float* %load2, + float %val) nounwind { +; CHECK-LABEL: store-pre-indexed-float3 +; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #8]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.float*, %pre.struct.float** %this + %gep1 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ] + store float %val, float* %retptr + ret void +} + +define void @store-pre-indexed-double3(%pre.struct.double** %this, i1 %cond, + %pre.struct.double* %load2, + double %val) nounwind { +; CHECK-LABEL: store-pre-indexed-double3 +; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #16]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.double*, %pre.struct.double** %this + %gep1 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ] + store double %val, double* %retptr + ret void +} + ; Check the following transform: ; ; ldr X, [x20] @@ -385,6 +672,54 @@ return: ; ; with X being either w0, x0, s0, d0 or q0. 
+define void @load-post-indexed-byte(i8* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-byte +; CHECK: ldrb w{{[0-9]+}}, [x{{[0-9]+}}], #4 +entry: + %gep1 = getelementptr i8, i8* %array, i64 2 + br label %body + +body: + %iv2 = phi i8* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i8, i8* %iv2, i64 -1 + %load = load i8, i8* %gep2 + call void @use-byte(i8 %load) + %load2 = load i8, i8* %iv2 + call void @use-byte(i8 %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i8, i8* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @load-post-indexed-halfword(i16* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-halfword +; CHECK: ldrh w{{[0-9]+}}, [x{{[0-9]+}}], #8 +entry: + %gep1 = getelementptr i16, i16* %array, i64 2 + br label %body + +body: + %iv2 = phi i16* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i16, i16* %iv2, i64 -1 + %load = load i16, i16* %gep2 + call void @use-halfword(i16 %load) + %load2 = load i16, i16* %iv2 + call void @use-halfword(i16 %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i16, i16* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + define void @load-post-indexed-word(i32* %array, i64 %count) nounwind { ; CHECK-LABEL: load-post-indexed-word ; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #16 @@ -515,6 +850,52 @@ exit: ; ; with X being either w0, x0, s0, d0 or q0. 
+define void @store-post-indexed-byte(i8* %array, i64 %count, i8 %val) nounwind { +; CHECK-LABEL: store-post-indexed-byte +; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}], #4 +entry: + %gep1 = getelementptr i8, i8* %array, i64 2 + br label %body + +body: + %iv2 = phi i8* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i8, i8* %iv2, i64 -1 + %load = load i8, i8* %gep2 + call void @use-byte(i8 %load) + store i8 %val, i8* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i8, i8* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @store-post-indexed-halfword(i16* %array, i64 %count, i16 %val) nounwind { +; CHECK-LABEL: store-post-indexed-halfword +; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}], #8 +entry: + %gep1 = getelementptr i16, i16* %array, i64 2 + br label %body + +body: + %iv2 = phi i16* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i16, i16* %iv2, i64 -1 + %load = load i16, i16* %gep2 + call void @use-halfword(i16 %load) + store i16 %val, i16* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i16, i16* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + define void @store-post-indexed-word(i32* %array, i64 %count, i32 %val) nounwind { ; CHECK-LABEL: store-post-indexed-word ; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #16 @@ -630,12 +1011,98 @@ exit: ret void } +declare void @use-byte(i8) +declare void @use-halfword(i16) declare void @use-word(i32) declare void @use-doubleword(i64) declare void @use-quadword(<2 x i64>) declare void @use-float(float) declare void @use-double(double) +; Check the following transform: +; +; stp w0, [x20] +; ... 
+; add x20, x20, #32 +; -> +; stp w0, [x20], #32 + +define void @store-pair-post-indexed-word() nounwind { +; CHECK-LABEL: store-pair-post-indexed-word +; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16 +; CHECK: ret + %src = alloca { i32, i32 }, align 8 + %dst = alloca { i32, i32 }, align 8 + + %src.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 0 + %src.real = load i32, i32* %src.realp + %src.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 1 + %src.imag = load i32, i32* %src.imagp + + %dst.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 1 + store i32 %src.real, i32* %dst.realp + store i32 %src.imag, i32* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-doubleword() nounwind { +; CHECK-LABEL: store-pair-post-indexed-doubleword +; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32 +; CHECK: ret + %src = alloca { i64, i64 }, align 8 + %dst = alloca { i64, i64 }, align 8 + + %src.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 0 + %src.real = load i64, i64* %src.realp + %src.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 1 + %src.imag = load i64, i64* %src.imagp + + %dst.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 1 + store i64 %src.real, i64* %dst.realp + store i64 %src.imag, i64* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-float() nounwind { +; CHECK-LABEL: store-pair-post-indexed-float +; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16 +; CHECK: ret + %src = alloca { float, float }, align 8 + %dst = alloca { float, float }, align 8 + + %src.realp = getelementptr inbounds { float, float }, { float, float }* %src, i32 0, i32 0 + %src.real = load float, float* %src.realp + %src.imagp = getelementptr 
inbounds { float, float }, { float, float }* %src, i32 0, i32 1 + %src.imag = load float, float* %src.imagp + + %dst.realp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 1 + store float %src.real, float* %dst.realp + store float %src.imag, float* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-double() nounwind { +; CHECK-LABEL: store-pair-post-indexed-double +; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32 +; CHECK: ret + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 0 + %src.real = load double, double* %src.realp + %src.imagp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 1 + %src.imag = load double, double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} + ; Check the following transform: ; ; (ldr|str) X, [x20] diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll index 18dbad4ce25b1..86f5edd5da1d4 100644 --- a/test/CodeGen/AArch64/merge-store.ll +++ b/test/CodeGen/AArch64/merge-store.ll @@ -1,4 +1,5 @@ ; RUN: llc -march aarch64 %s -o - | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone | FileCheck %s --check-prefix=CYCLONE @g0 = external global <3 x float>, align 16 @g1 = external global <3 x float>, align 4 @@ -18,3 +19,32 @@ define void @blam() { store float %tmp9, float* %tmp7 ret void; } + + +; PR21711 - Merge vector stores into wider vector stores. 
+ +; On Cyclone, the stores should not get merged into a 16-byte store because +; unaligned 16-byte stores are slow. This test would infinite loop when +; the fastness of unaligned accesses was not specified correctly. + +define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) { + %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3 + %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4 + + %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> + %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> + + store <2 x float> %shuffle0, <2 x float>* %idx0, align 8 + store <2 x float> %shuffle1, <2 x float>* %idx1, align 8 + ret void + +; CHECK-LABEL: merge_vec_extract_stores +; CHECK: stur q0, [x0, #24] +; CHECK-NEXT: ret + +; CYCLONE-LABEL: merge_vec_extract_stores +; CYCLONE: ext v1.16b, v0.16b, v0.16b, #8 +; CYCLONE-NEXT: str d0, [x0, #24] +; CYCLONE-NEXT: str d1, [x0, #32] +; CYCLONE-NEXT: ret +} diff --git a/test/CodeGen/AArch64/misched-fusion.ll b/test/CodeGen/AArch64/misched-fusion.ll new file mode 100644 index 0000000000000..d38869329034f --- /dev/null +++ b/test/CodeGen/AArch64/misched-fusion.ll @@ -0,0 +1,34 @@ +; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s +target triple = "arm64-apple-ios" + +declare void @foobar(i32 %v0, i32 %v1) + +; Make sure sub is scheduled in front of cbnz +; CHECK-LABEL: test_sub_cbz: +; CHECK: add w[[ADDRES:[0-9]+]], w1, #7 +; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13 +; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]] +; CHECK: mov x0, x[[ADDRES]] +; CHECK: mov x1, x[[SUBRES]] +; CHECK: bl _foobar +; CHECK: [[SKIPBLOCK]]: +; CHECK: mov x0, x[[SUBRES]] +; CHECK: mov x1, x[[ADDRES]] +; CHECK: bl _foobar +define void @test_sub_cbz(i32 %a0, i32 %a1) { +entry: + ; except for the fusion opportunity the sub/add should be equal so the + ; scheduler would leave them in source order if it weren't for the scheduling + %v0 = sub i32 %a0, 13 + %cond = icmp 
eq i32 %v0, 0 + %v1 = add i32 %a1, 7 + br i1 %cond, label %if, label %exit + +if: + call void @foobar(i32 %v1, i32 %v0) + br label %exit + +exit: + call void @foobar(i32 %v0, i32 %v1) + ret void +} diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll index 4515697b99918..e93521858a312 100644 --- a/test/CodeGen/AArch64/mul-lohi.ll +++ b/test/CodeGen/AArch64/mul-lohi.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s ; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s + define i128 @test_128bitmul(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: test_128bitmul: ; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3 @@ -16,3 +17,31 @@ define i128 @test_128bitmul(i128 %lhs, i128 %rhs) { %prod = mul i128 %lhs, %rhs ret i128 %prod } + +; The machine combiner should create madd instructions when +; optimizing for size because that's smaller than mul + add. + +define i128 @test_128bitmul_optsize(i128 %lhs, i128 %rhs) optsize { +; CHECK-LABEL: test_128bitmul_optsize: +; CHECK: umulh [[HI:x[0-9]+]], x0, x2 +; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]] +; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]] +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: ret + + %prod = mul i128 %lhs, %rhs + ret i128 %prod +} + +define i128 @test_128bitmul_minsize(i128 %lhs, i128 %rhs) minsize { +; CHECK-LABEL: test_128bitmul_minsize: +; CHECK: umulh [[HI:x[0-9]+]], x0, x2 +; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]] +; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]] +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: ret + + %prod = mul i128 %lhs, %rhs + ret i128 %prod +} + diff --git a/test/CodeGen/AArch64/nest-register.ll b/test/CodeGen/AArch64/nest-register.ll index 9c659fb74ec44..cc42913e10a6c 100644 --- a/test/CodeGen/AArch64/nest-register.ll +++ b/test/CodeGen/AArch64/nest-register.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc 
-disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; Tests that the 'nest' parameter attribute causes the relevant parameter to be ; passed in the right register. diff --git a/test/CodeGen/AArch64/nontemporal.ll b/test/CodeGen/AArch64/nontemporal.ll new file mode 100644 index 0000000000000..db9779e031904 --- /dev/null +++ b/test/CodeGen/AArch64/nontemporal.ll @@ -0,0 +1,339 @@ +; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra | FileCheck %s + +define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 { +; CHECK-LABEL: test_stnp_v4i64: +; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1] +; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d1, d[[HI1]], [x0, #16] +; CHECK-NEXT: stnp d0, d[[HI0]], [x0] +; CHECK-NEXT: ret + store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 { +; CHECK-LABEL: test_stnp_v4i32: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 { +; CHECK-LABEL: test_stnp_v8i16: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { +; CHECK-LABEL: test_stnp_v16i8: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 { +; CHECK-LABEL: test_stnp_v2i32: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0 + ret void +} + +define void 
@test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 { +; CHECK-LABEL: test_stnp_v4i16: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 { +; CHECK-LABEL: test_stnp_v8i8: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v2f64: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v1f64: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 { +; CHECK-LABEL: test_stnp_v1i64: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_i64(i64* %p, i64 %v) #0 { +; CHECK-LABEL: 
test_stnp_i64: +; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32 +; CHECK-NEXT: stnp w1, w[[HI]], [x0] +; CHECK-NEXT: ret + store i64 %v, i64* %p, align 1, !nontemporal !0 + ret void +} + + +define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v2f64_offset: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #16] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1 + store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v2f64_offset_neg: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #-16] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1 + store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0, #8] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1 + store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset_neg: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0, #-8] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1 + store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 { +; CHECK-LABEL: test_stnp_i64_offset: +; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32 +; CHECK-NEXT: stnp w1, w[[HI]], [x0, #8] +; CHECK-NEXT: ret + %tmp0 = getelementptr i64, i64* %p, i32 1 + store i64 %v, i64* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void 
@test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 { +; CHECK-LABEL: test_stnp_i64_offset_neg: +; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32 +; CHECK-NEXT: stnp w1, w[[HI]], [x0, #-8] +; CHECK-NEXT: ret + %tmp0 = getelementptr i64, i64* %p, i32 -1 + store i64 %v, i64* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_4(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_4: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #4 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 4 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_neg_4(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_4: +; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #4 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -4 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_512(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_512: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #512 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 512 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_offset_504(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_504: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #504] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 504 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x 
float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_508(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_508: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #508 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 508 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_neg_520(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_520: +; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #520 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -520 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_offset_neg_512(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_neg_512: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #-512] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -512 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + + +define void @test_stnp_v2f32_invalid_offset_256(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_invalid_offset_256: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #256 +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 256 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset_252(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset_252: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp 
s0, s[[HI]], [x0, #252] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 252 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_invalid_offset_neg_260(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_invalid_offset_neg_260: +; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #260 +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -260 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset_neg_256(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset_neg_256: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0, #-256] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -256 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +declare void @dummy(<4 x float>*) + +define void @test_stnp_v4f32_offset_alloca(<4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_alloca: +; CHECK: stnp d0, d{{.*}}, [sp] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: bl _dummy + %tmp0 = alloca <4 x float> + store <4 x float> %v, <4 x float>* %tmp0, align 1, !nontemporal !0 + call void @dummy(<4 x float>* %tmp0) + ret void +} + +define void @test_stnp_v4f32_offset_alloca_2(<4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_alloca_2: +; CHECK: stnp d0, d{{.*}}, [sp, #16] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: bl _dummy + %tmp0 = alloca <4 x float>, i32 2 + %tmp1 = getelementptr <4 x float>, <4 x float>* %tmp0, i32 1 + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + call void @dummy(<4 x float>* %tmp0) + ret void +} + +!0 = !{ i32 1 } + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll 
b/test/CodeGen/AArch64/pic-eh-stubs.ll index 143558f7b2c72..c59a5b6743d63 100644 --- a/test/CodeGen/AArch64/pic-eh-stubs.ll +++ b/test/CodeGen/AArch64/pic-eh-stubs.ll @@ -15,7 +15,7 @@ ; CHECK-NEXT: .xword .L_ZTIi.DW.stub-[[TYPEINFO_LBL]] ; .. and which is properly defined (in a writable section for the dynamic loader) later. -; CHECK: .section .data.rel,"aw" +; CHECK: .data ; CHECK: .L_ZTIi.DW.stub: ; CHECK-NEXT: .xword _ZTIi diff --git a/test/CodeGen/AArch64/readcyclecounter.ll b/test/CodeGen/AArch64/readcyclecounter.ll new file mode 100644 index 0000000000000..037f118093869 --- /dev/null +++ b/test/CodeGen/AArch64/readcyclecounter.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=aarch64-unknown-unknown -asm-verbose=false < %s |\ +; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=PERFMON +; RUN: llc -mtriple=aarch64-unknown-unknown -mattr=-perfmon -asm-verbose=false < %s |\ +; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=NOPERFMON + +define i64 @test_readcyclecounter() nounwind { + ; CHECK-LABEL: test_readcyclecounter: + ; PERFMON-NEXT: mrs x0, PMCCNTR_EL0 + ; NOPERFMON-NEXT: mov x0, xzr + ; CHECK-NEXT: ret + %tmp0 = call i64 @llvm.readcyclecounter() + ret i64 %tmp0 +} + +declare i64 @llvm.readcyclecounter() diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll index 0d301bbd502a3..ba34873eaa5b7 100644 --- a/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -27,8 +27,8 @@ define i64 @test_chains() { ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]] ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 -; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]] -; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]] +; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]] +; CHECK: and w0, w[[STRVAL]], #0xff %ret.1 = load i8, i8* %locvar %ret.2 = zext i8 %ret.1 to i64 diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll index 8b3e6dd5ad92a..a397c339a2d7b
100644 --- a/test/CodeGen/AArch64/remat.ll +++ b/test/CodeGen/AArch64/remat.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a35 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s diff --git a/test/CodeGen/AArch64/rotate.ll b/test/CodeGen/AArch64/rotate.ll new file mode 100644 index 0000000000000..5ac86d5f59c9d --- /dev/null +++ b/test/CodeGen/AArch64/rotate.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=aarch64--linux-gnueabihf | FileCheck %s + +;; This used to cause a backend crash about not being able to +;; select ROTL. Make sure it generates the basic ushr/shl. +define <2 x i64> @testcase(<2 x i64>* %in) { +; CHECK-LABEL: testcase +; CHECK: ushr {{v[0-9]+}}.2d +; CHECK: shl {{v[0-9]+}}.2d + %1 = load <2 x i64>, <2 x i64>* %in + %2 = lshr <2 x i64> %1, <i64 8, i64 8> + %3 = shl <2 x i64> %1, <i64 56, i64 56> + %4 = or <2 x i64> %2, %3 + ret <2 x i64> %4 +} diff --git a/test/CodeGen/AArch64/round-conv.ll b/test/CodeGen/AArch64/round-conv.ll new file mode 100644 index 0000000000000..5ed7d9409e3dd --- /dev/null +++ b/test/CodeGen/AArch64/round-conv.ll @@ -0,0 +1,330 @@ +; RUN: llc < %s -mtriple=arm64 | FileCheck %s + +; CHECK-LABEL: testmsws: +; CHECK: fcvtms w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testmsws(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmsxs: +; CHECK: fcvtms x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testmsxs(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testmswd: +; CHECK: fcvtms w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testmswd(double %a) { +entry: + %call = call double @floor(double %a)
nounwind readnone + %conv = fptosi double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmsxd: +; CHECK: fcvtms x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testmsxd(double %a) { +entry: + %call = call double @floor(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testmuws: +; CHECK: fcvtmu w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testmuws(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmuxs: +; CHECK: fcvtmu x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testmuxs(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testmuwd: +; CHECK: fcvtmu w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testmuwd(double %a) { +entry: + %call = call double @floor(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmuxd: +; CHECK: fcvtmu x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testmuxd(double %a) { +entry: + %call = call double @floor(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpsws: +; CHECK: fcvtps w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testpsws(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpsxs: +; CHECK: fcvtps x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testpsxs(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpswd: +; CHECK: fcvtps w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testpswd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptosi double 
%call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpsxd: +; CHECK: fcvtps x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testpsxd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpuws: +; CHECK: fcvtpu w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testpuws(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpuxs: +; CHECK: fcvtpu x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testpuxs(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpuwd: +; CHECK: fcvtpu w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testpuwd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpuxd: +; CHECK: fcvtpu x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testpuxd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzsws: +; CHECK: fcvtzs w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testzsws(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testzsxs: +; CHECK: fcvtzs x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testzsxs(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzswd: +; CHECK: fcvtzs w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testzswd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptosi double %call to i32 + ret i32 %conv +} + +; 
CHECK-LABEL: testzsxd: +; CHECK: fcvtzs x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testzsxd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzuws: +; CHECK: fcvtzu w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testzuws(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testzuxs: +; CHECK: fcvtzu x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testzuxs(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzuwd: +; CHECK: fcvtzu w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testzuwd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testzuxd: +; CHECK: fcvtzu x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testzuxd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testasws: +; CHECK: fcvtas w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testasws(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testasxs: +; CHECK: fcvtas x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testasxs(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testaswd: +; CHECK: fcvtas w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testaswd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptosi double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testasxd: +; CHECK: fcvtas x0, 
d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testasxd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testauws: +; CHECK: fcvtau w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testauws(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testauxs: +; CHECK: fcvtau x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testauxs(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testauwd: +; CHECK: fcvtau w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testauwd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testauxd: +; CHECK: fcvtau x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testauxd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +declare float @floorf(float) nounwind readnone +declare double @floor(double) nounwind readnone +declare float @ceilf(float) nounwind readnone +declare double @ceil(double) nounwind readnone +declare float @truncf(float) nounwind readnone +declare double @trunc(double) nounwind readnone +declare float @roundf(float) nounwind readnone +declare double @round(double) nounwind readnone diff --git a/test/CodeGen/AArch64/shrink-wrap.ll b/test/CodeGen/AArch64/shrink-wrap.ll new file mode 100755 index 0000000000000..ea101a8da15d5 --- /dev/null +++ b/test/CodeGen/AArch64/shrink-wrap.ll @@ -0,0 +1,184 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s + +; Regression test for a crash in the ShrinkWrap pass not handling targets +; requiring a register scavenger. 
+ +%type1 = type { i32, i32, i32 } + +@g1 = external unnamed_addr global i32, align 4 +@g2 = external unnamed_addr global i1 +@g3 = external unnamed_addr global [144 x i32], align 4 +@g4 = external unnamed_addr constant [144 x i32], align 4 +@g5 = external unnamed_addr constant [144 x i32], align 4 +@g6 = external unnamed_addr constant [144 x i32], align 4 +@g7 = external unnamed_addr constant [144 x i32], align 4 +@g8 = external unnamed_addr constant [144 x i32], align 4 +@g9 = external unnamed_addr constant [144 x i32], align 4 +@g10 = external unnamed_addr constant [144 x i32], align 4 +@g11 = external unnamed_addr global i32, align 4 +@g12 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g13 = external unnamed_addr global %type1*, align 8 +@g14 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g15 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g16 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g17 = external unnamed_addr global [62 x i32], align 4 +@g18 = external unnamed_addr global i32, align 4 +@g19 = external unnamed_addr constant [144 x i32], align 4 +@g20 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g21 = external unnamed_addr global i32, align 4 + +declare fastcc i32 @foo() + +declare fastcc i32 @bar() + +define internal fastcc i32 @func(i32 %alpha, i32 %beta) { +entry: + %v1 = alloca [2 x [11 x i32]], align 4 + %v2 = alloca [11 x i32], align 16 + %v3 = alloca [11 x i32], align 16 + switch i32 undef, label %if.end.9 [ + i32 4, label %if.then.6 + i32 3, label %if.then.2 + ] + +if.then.2: + %call3 = tail call fastcc i32 @bar() + br label %cleanup + +if.then.6: + %call7 = tail call fastcc i32 @foo() + unreachable + +if.end.9: + %tmp = load i32, i32* @g1, align 4 + %rem.i = urem i32 %tmp, 1000000 + %idxprom.1.i = zext i32 %rem.i to i64 + %tmp1 = load %type1*, %type1** @g13, align 8 + %v4 = getelementptr inbounds %type1, %type1* %tmp1, i64 %idxprom.1.i, i32 0 + %.b = load i1, i1* @g2, align 1 
+ %v5 = select i1 %.b, i32 2, i32 0 + %tmp2 = load i32, i32* @g18, align 4 + %tmp3 = load i32, i32* @g11, align 4 + %idxprom58 = sext i32 %tmp3 to i64 + %tmp4 = load i32, i32* @g21, align 4 + %idxprom69 = sext i32 %tmp4 to i64 + br label %for.body + +for.body: + %v6 = phi i32 [ 0, %if.end.9 ], [ %v7, %for.inc ] + %a.0983 = phi i32 [ 1, %if.end.9 ], [ %a.1, %for.inc ] + %arrayidx = getelementptr inbounds [62 x i32], [62 x i32]* @g17, i64 0, i64 undef + %tmp5 = load i32, i32* %arrayidx, align 4 + br i1 undef, label %for.inc, label %if.else.51 + +if.else.51: + %idxprom53 = sext i32 %tmp5 to i64 + %arrayidx54 = getelementptr inbounds [144 x i32], [144 x i32]* @g3, i64 0, i64 %idxprom53 + %tmp6 = load i32, i32* %arrayidx54, align 4 + switch i32 %tmp6, label %for.inc [ + i32 1, label %block.bb + i32 10, label %block.bb.159 + i32 7, label %block.bb.75 + i32 8, label %block.bb.87 + i32 9, label %block.bb.147 + i32 12, label %block.bb.111 + i32 3, label %block.bb.123 + i32 4, label %block.bb.135 + ] + +block.bb: + %arrayidx56 = getelementptr inbounds [144 x i32], [144 x i32]* @g6, i64 0, i64 %idxprom53 + %tmp7 = load i32, i32* %arrayidx56, align 4 + %shr = ashr i32 %tmp7, %v5 + %add57 = add nsw i32 %shr, 0 + %arrayidx61 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g14, i64 0, i64 %idxprom53, i64 %idxprom58 + %tmp8 = load i8, i8* %arrayidx61, align 1 + %conv = zext i8 %tmp8 to i32 + %add62 = add nsw i32 %conv, %add57 + br label %for.inc + +block.bb.75: + %arrayidx78 = getelementptr inbounds [144 x i32], [144 x i32]* @g10, i64 0, i64 %idxprom53 + %tmp9 = load i32, i32* %arrayidx78, align 4 + %shr79 = ashr i32 %tmp9, %v5 + %add80 = add nsw i32 %shr79, 0 + %add86 = add nsw i32 0, %add80 + br label %for.inc + +block.bb.87: + %arrayidx90 = getelementptr inbounds [144 x i32], [144 x i32]* @g9, i64 0, i64 %idxprom53 + %tmp10 = load i32, i32* %arrayidx90, align 4 + %shr91 = ashr i32 %tmp10, 0 + %sub92 = sub nsw i32 0, %shr91 + %arrayidx96 = getelementptr 
inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g15, i64 0, i64 %idxprom53, i64 %idxprom69 + %tmp11 = load i8, i8* %arrayidx96, align 1 + %conv97 = zext i8 %tmp11 to i32 + %sub98 = sub nsw i32 %sub92, %conv97 + br label %for.inc + +block.bb.111: + %arrayidx114 = getelementptr inbounds [144 x i32], [144 x i32]* @g19, i64 0, i64 %idxprom53 + %tmp12 = load i32, i32* %arrayidx114, align 4 + %shr115 = ashr i32 %tmp12, 0 + %sub116 = sub nsw i32 0, %shr115 + %arrayidx120 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g12, i64 0, i64 %idxprom53, i64 %idxprom69 + %tmp13 = load i8, i8* %arrayidx120, align 1 + %conv121 = zext i8 %tmp13 to i32 + %sub122 = sub nsw i32 %sub116, %conv121 + br label %for.inc + +block.bb.123: + %arrayidx126 = getelementptr inbounds [144 x i32], [144 x i32]* @g5, i64 0, i64 %idxprom53 + %tmp14 = load i32, i32* %arrayidx126, align 4 + %shr127 = ashr i32 %tmp14, %v5 + %add128 = add nsw i32 %shr127, 0 + %add134 = add nsw i32 0, %add128 + br label %for.inc + +block.bb.135: + %arrayidx138 = getelementptr inbounds [144 x i32], [144 x i32]* @g4, i64 0, i64 %idxprom53 + %tmp15 = load i32, i32* %arrayidx138, align 4 + %shr139 = ashr i32 %tmp15, 0 + %sub140 = sub nsw i32 0, %shr139 + %arrayidx144 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g20, i64 0, i64 %idxprom53, i64 %idxprom69 + %tmp16 = load i8, i8* %arrayidx144, align 1 + %conv145 = zext i8 %tmp16 to i32 + %sub146 = sub nsw i32 %sub140, %conv145 + br label %for.inc + +block.bb.147: + %arrayidx150 = getelementptr inbounds [144 x i32], [144 x i32]* @g8, i64 0, i64 %idxprom53 + %tmp17 = load i32, i32* %arrayidx150, align 4 + %shr151 = ashr i32 %tmp17, %v5 + %add152 = add nsw i32 %shr151, 0 + %arrayidx156 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g16, i64 0, i64 %idxprom53, i64 %idxprom58 + %tmp18 = load i8, i8* %arrayidx156, align 1 + %conv157 = zext i8 %tmp18 to i32 + %add158 = add nsw i32 %conv157, %add152 + br label %for.inc + 
+block.bb.159: + %sub160 = add nsw i32 %v6, -450 + %arrayidx162 = getelementptr inbounds [144 x i32], [144 x i32]* @g7, i64 0, i64 %idxprom53 + %tmp19 = load i32, i32* %arrayidx162, align 4 + %shr163 = ashr i32 %tmp19, 0 + %sub164 = sub nsw i32 %sub160, %shr163 + %sub170 = sub nsw i32 %sub164, 0 + br label %for.inc + +for.inc: + %v7 = phi i32 [ %v6, %for.body ], [ %v6, %if.else.51 ], [ %sub170, %block.bb.159 ], [ %add158, %block.bb.147 ], [ %sub146, %block.bb.135 ], [ %add134, %block.bb.123 ], [ %sub122, %block.bb.111 ], [ %sub98, %block.bb.87 ], [ %add86, %block.bb.75 ], [ %add62, %block.bb ] + %a.1 = phi i32 [ %a.0983, %for.body ], [ undef, %if.else.51 ], [ undef, %block.bb.159 ], [ undef, %block.bb.147 ], [ undef, %block.bb.135 ], [ undef, %block.bb.123 ], [ undef, %block.bb.111 ], [ undef, %block.bb.87 ], [ undef, %block.bb.75 ], [ undef, %block.bb ] + %cmp48 = icmp sgt i32 %a.1, %tmp2 + br i1 %cmp48, label %for.end, label %for.body + +for.end: + store i32 %tmp, i32* %v4, align 4 + %hold_hash.i.7 = getelementptr inbounds %type1, %type1* %tmp1, i64 %idxprom.1.i, i32 1 + store i32 0, i32* %hold_hash.i.7, align 4 + br label %cleanup + +cleanup: + %retval.0 = phi i32 [ %call3, %if.then.2 ], [ undef, %for.end ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll new file mode 100644 index 0000000000000..4712012b0d25d --- /dev/null +++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll @@ -0,0 +1,20 @@ +; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL +; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL + +define void @caller_meta_leaf() { +entry: + %metadata = alloca i64, i32 3, align 8 + store i64 11, i64* %metadata + store i64 12, i64* %metadata + store i64 13, i64* %metadata +; ISEL: 
ADJCALLSTACKDOWN 0, implicit-def +; ISEL-NEXT: STACKMAP +; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def + call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) +; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def +; FAST-ISEL-NEXT: STACKMAP +; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def + ret void +} + +declare void @llvm.experimental.stackmap(i64, i32, ...) diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll index e5766154bb46f..fa5d8b943b6b5 100644 --- a/test/CodeGen/AArch64/tail-call.ll +++ b/test/CodeGen/AArch64/tail-call.ll @@ -59,8 +59,7 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) { ; callee will not deallocate the space, even in fastcc. tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2) -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack16 ret void } @@ -89,8 +88,7 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { ret void ; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack16 } diff --git a/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/test/CodeGen/AArch64/tailcall-explicit-sret.ll index 4d80f2ac5c121..bcc8af8d0690f 100644 --- a/test/CodeGen/AArch64/tailcall-explicit-sret.ll +++ b/test/CodeGen/AArch64/tailcall-explicit-sret.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false -disable-post-ra | FileCheck %s ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks. 
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/AArch64/tbi.ll b/test/CodeGen/AArch64/tbi.ll new file mode 100644 index 0000000000000..ab2d31b7cacc0 --- /dev/null +++ b/test/CodeGen/AArch64/tbi.ll @@ -0,0 +1,102 @@ +; RUN: llc -aarch64-use-tbi -mtriple=arm64-apple-ios8.0.0 < %s \ +; RUN: | FileCheck --check-prefix=TBI --check-prefix=BOTH %s +; RUN: llc -aarch64-use-tbi -mtriple=arm64-apple-ios7.1.0 < %s \ +; RUN: | FileCheck --check-prefix=NO_TBI --check-prefix=BOTH %s + +; BOTH-LABEL:ld_and32: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_and32(i64 %p) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; load (r & MASK) + 4 +; BOTH-LABEL:ld_and_plus_offset: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_and_plus_offset(i64 %p) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + %gep = getelementptr i32, i32* %cast, i64 4 + %load = load i32, i32* %gep + ret i32 %load +} + +; load (r & WIDER_MASK) +; BOTH-LABEL:ld_and32_wider: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_and32_wider(i64 %p) { + %and = and i64 %p, 1152921504606846975 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; BOTH-LABEL:ld_and64: +; TBI-NOT: and x +; NO_TBI: and x +define i64 @ld_and64(i64 %p) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i64* + %load = load i64, i64* %cast + ret i64 %load +} + +; BOTH-LABEL:st_and32: +; TBI-NOT: and x +; NO_TBI: and x +define void @st_and32(i64 %p, i32 %v) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + store i32 %v, i32* %cast + ret void +} + +; load (x1 + x2) & MASK +; BOTH-LABEL:ld_ro: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_ro(i64 %a, i64 %b) { + %p = add i64 %a, %b + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; 
load (r1 & MASK) + r2 +; BOTH-LABEL:ld_ro2: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_ro2(i64 %a, i64 %b) { + %and = and i64 %a, 72057594037927935 + %p = add i64 %and, %b + %cast = inttoptr i64 %p to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; load (r1 & MASK) | r2 +; BOTH-LABEL:ld_indirect_and: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_indirect_and(i64 %r1, i64 %r2) { + %and = and i64 %r1, 72057594037927935 + %p = or i64 %and, %r2 + %cast = inttoptr i64 %p to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; BOTH-LABEL:ld_and32_narrower: +; BOTH: and x +define i32 @ld_and32_narrower(i64 %p) { + %and = and i64 %p, 36028797018963967 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} diff --git a/test/CodeGen/AArch64/vector-fcopysign.ll b/test/CodeGen/AArch64/vector-fcopysign.ll new file mode 100644 index 0000000000000..865a0a5b85808 --- /dev/null +++ b/test/CodeGen/AArch64/vector-fcopysign.ll @@ -0,0 +1,178 @@ +; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +;============ v1f32 + +; WidenVecRes same +define <1 x float> @test_copysign_v1f32_v1f32(<1 x float> %a, <1 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v1f32_v1f32: +; CHECK-NEXT: movi.2s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b) + ret <1 x float> %r +} + +; WidenVecRes mismatched +define <1 x float> @test_copysign_v1f32_v1f64(<1 x float> %a, <1 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v1f32_v1f64: +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fptrunc <1 x double> %b to <1 x float> + %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %tmp0) + ret <1 x float> %r +} + +declare <1 x float> 
@llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b) #0 + +;============ v1f64 + +; WidenVecOp #1 +define <1 x double> @test_copysign_v1f64_v1f32(<1 x double> %a, <1 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v1f64_v1f32: +; CHECK-NEXT: fcvt d1, s1 +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fpext <1 x float> %b to <1 x double> + %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %tmp0) + ret <1 x double> %r +} + +define <1 x double> @test_copysign_v1f64_v1f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v1f64_v1f64: +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %r +} + +declare <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) #0 + +;============ v2f32 + +define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v2f32_v2f32: +; CHECK-NEXT: movi.2s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v2f32_v2f64: +; CHECK-NEXT: fcvtn v1.2s, v1.2d +; CHECK-NEXT: movi.2s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fptrunc <2 x double> %b to <2 x float> + %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %tmp0) + ret <2 x float> %r +} + +declare <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) #0 + +;============ v4f32 + +define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v4f32_v4f32: +; CHECK-NEXT: movi.4s v2, 
#0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %r +} + +; SplitVecOp #1 +define <4 x float> @test_copysign_v4f32_v4f64(<4 x float> %a, <4 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v4f32_v4f64: +; CHECK-NEXT: mov s3, v0[1] +; CHECK-NEXT: mov d4, v1[1] +; CHECK-NEXT: movi.4s v5, #0x80, lsl #24 +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: mov s6, v0[2] +; CHECK-NEXT: mov s7, v0[3] +; CHECK-NEXT: fcvt s16, d2 +; CHECK-NEXT: bit.16b v0, v1, v5 +; CHECK-NEXT: bit.16b v6, v16, v5 +; CHECK-NEXT: fcvt s1, d4 +; CHECK-NEXT: bit.16b v3, v1, v5 +; CHECK-NEXT: mov d1, v2[1] +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: ins.s v0[1], v3[0] +; CHECK-NEXT: ins.s v0[2], v6[0] +; CHECK-NEXT: bit.16b v7, v1, v5 +; CHECK-NEXT: ins.s v0[3], v7[0] +; CHECK-NEXT: ret + %tmp0 = fptrunc <4 x double> %b to <4 x float> + %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0) + ret <4 x float> %r +} + +declare <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) #0 + +;============ v2f64 + +define <2 x double> @test_copysign_v2f64_v232(<2 x double> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v2f64_v232: +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: fcvtl v1.2d, v1.2s +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fpext <2 x float> %b to <2 x double> + %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %tmp0) + ret <2 x double> %r +} + +define <2 x double> @test_copysign_v2f64_v2f64(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v2f64_v2f64: +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %r +} + +declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, 
<2 x double> %b) #0 + +;============ v4f64 + +; SplitVecRes mismatched +define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v4f64_v4f32: +; CHECK-NEXT: movi.2d v3, #0000000000000000 +; CHECK-NEXT: fcvtl2 v4.2d, v2.4s +; CHECK-NEXT: fcvtl v2.2d, v2.2s +; CHECK-NEXT: fneg.2d v3, v3 +; CHECK-NEXT: bit.16b v1, v4, v3 +; CHECK-NEXT: bit.16b v0, v2, v3 +; CHECK-NEXT: ret + %tmp0 = fpext <4 x float> %b to <4 x double> + %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0) + ret <4 x double> %r +} + +; SplitVecRes same +define <4 x double> @test_copysign_v4f64_v4f64(<4 x double> %a, <4 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v4f64_v4f64: +; CHECK-NEXT: movi.2d v4, #0000000000000000 +; CHECK-NEXT: fneg.2d v4, v4 +; CHECK-NEXT: bit.16b v0, v2, v4 +; CHECK-NEXT: bit.16b v1, v3, v4 +; CHECK-NEXT: ret + %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) + ret <4 x double> %r +} + +declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/xbfiz.ll b/test/CodeGen/AArch64/xbfiz.ll index f763400d7f6a7..3211cc3f2cedb 100644 --- a/test/CodeGen/AArch64/xbfiz.ll +++ b/test/CodeGen/AArch64/xbfiz.ll @@ -31,3 +31,33 @@ define i32 @ubfiz32(i32 %v) { %shr = lshr i32 %shl, 2 ret i32 %shr } + +define i64 @ubfiz64and(i64 %v) { +; CHECK-LABEL: ubfiz64and: +; CHECK: ubfiz x0, x0, #36, #11 + %shl = shl i64 %v, 36 + %and = and i64 %shl, 140668768878592 + ret i64 %and +} + +define i32 @ubfiz32and(i32 %v) { +; CHECK-LABEL: ubfiz32and: +; CHECK: ubfiz w0, w0, #6, #24 + %shl = shl i32 %v, 6 + %and = and i32 %shl, 1073741760 + ret i32 %and +} + +; Check that we don't generate a ubfiz if the lsl has more than one +; use, since we'd just be replacing an and with a ubfiz. 
+define i32 @noubfiz32(i32 %v) { +; CHECK-LABEL: noubfiz32: +; CHECK: lsl w[[REG1:[0-9]+]], w0, #6 +; CHECK: and w[[REG2:[0-9]+]], w[[REG1]], #0x3fffffc0 +; CHECK: add w0, w[[REG1]], w[[REG2]] +; CHECK: ret + %shl = shl i32 %v, 6 + %and = and i32 %shl, 1073741760 + %add = add i32 %shl, %and + ret i32 %add +} diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll index 655e75dbc1a48..2ddfa9649ac94 100644 --- a/test/CodeGen/AMDGPU/add.ll +++ b/test/CodeGen/AMDGPU/add.ll @@ -5,7 +5,7 @@ ;FUNC-LABEL: {{^}}test1: ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}} ;SI-NOT: [[REG]] ;SI: buffer_store_dword [[REG]], define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -21,8 +21,8 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -39,10 +39,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, 
v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll index 4be8c5847529c..3aa2f653bf9c4 100644 --- a/test/CodeGen/AMDGPU/address-space.ll +++ b/test/CodeGen/AMDGPU/address-space.ll @@ -5,15 +5,11 @@ %struct.foo = type { [3 x float], [3 x float] } -; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is -; already in a VGPR after the first read. - ; CHECK-LABEL: {{^}}do_as_ptr_calcs: ; CHECK: s_load_dword [[SREG1:s[0-9]+]], -; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll new file mode 100644 index 0000000000000..61bcd4b3c093d --- /dev/null +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -0,0 +1,66 @@ +; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: unsupported addrspacecast not implemented + +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK 
-check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. + +; CHECK-LABEL: {{^}}branch_use_flat_i32: +; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: s_endpgm +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32, i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; TODO: This should not be zero when registers are used for small +; scratch allocations again. + +; Check for prologue initializing special SGPRs pointing to scratch. 
+; CHECK-LABEL: {{^}}store_flat_scratch: +; CHECK: s_movk_i32 flat_scratch_lo, 0 +; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} +; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} +; CHECK: flat_store_dword +; CHECK: s_barrier +; CHECK: flat_load_dword +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { + %alloca = alloca i32, i32 9, align 4 + %x = call i32 @llvm.r600.read.tidig.x() #3 + %pptr = getelementptr i32, i32* %alloca, i32 %x + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr + ; Dummy call + call void @llvm.AMDGPU.barrier.local() #1 + %reload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %reload, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 +declare i32 @llvm.r600.read.tidig.x() #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind convergent } +attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll index 5672d470bd7e0..f83fb16101fb6 100644 --- a/test/CodeGen/AMDGPU/and.ll +++ b/test/CodeGen/AMDGPU/and.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +declare i32 @llvm.r600.read.tidig.x() #0 + ; FUNC-LABEL: {{^}}test2: ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -54,13 +56,80 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { ret void } -; FUNC-LABEL: {{^}}v_and_i32: -; SI: v_and_b32 -define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +; FIXME: We should really duplicate the constant so that the SALU use +; can fold into the s_and_b32 and the VALU one is materialized +; directly without copying from the SGPR. + +; Second use is a VGPR use of the constant. +; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0: +; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 +; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; SI: buffer_store_dword [[VK]] +define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %and = and i32 %a, 1234567 + + ; Just to stop future replacement of copy to vgpr + store with VALU op. + %foo = add i32 %and, %b + store volatile i32 %foo, i32 addrspace(1)* %out + store volatile i32 1234567, i32 addrspace(1)* %out + ret void +} + +; Second use is another SGPR use of the constant. 
+; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1: +; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_add_i32 +; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: buffer_store_dword [[VK]] +define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %and = and i32 %a, 1234567 + %foo = add i32 %and, 1234567 + %bar = add i32 %foo, %b + store volatile i32 %bar, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr: +; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep.a + %b = load i32, i32 addrspace(1)* %gep.b %and = and i32 %a, %b - store i32 %and, i32 addrspace(1)* %out, align 4 + store i32 %and, i32 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32_sgpr_vgpr: +; SI-DAG: s_load_dword [[SA:s[0-9]+]] +; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] +; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] +define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.b + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32_vgpr_sgpr: +; SI-DAG: s_load_dword [[SA:s[0-9]+]] +; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] +; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] +define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, 
i32 %b) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep.a + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %gep.out ret void } @@ -148,9 +217,23 @@ endif: } ; FUNC-LABEL: {{^}}v_and_constant_i64: -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207 +; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], {{v[0-9]+}} +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], {{v[0-9]+}} +; SI: buffer_store_dwordx2 define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 1231231234567 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Should replace and 0 +; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant: +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} +define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -294,3 +377,5 @@ define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 a store i64 %and, i64 addrspace(1)* %out, align 8 ret void } + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/test/CodeGen/AMDGPU/annotate-kernel-features.ll new file mode 100644 index 0000000000000..b116c72322bb9 --- /dev/null +++ b/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -0,0 +1,193 @@ +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA -check-prefix=ALL %s +; RUN: opt -S 
-amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s + +declare i32 @llvm.r600.read.tgid.x() #0 +declare i32 @llvm.r600.read.tgid.y() #0 +declare i32 @llvm.r600.read.tgid.z() #0 + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tidig.y() #0 +declare i32 @llvm.r600.read.tidig.z() #0 + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +declare i32 @llvm.r600.read.global.size.x() #0 +declare i32 @llvm.r600.read.global.size.y() #0 +declare i32 @llvm.r600.read.global.size.z() #0 + + +; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tgid.x() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tgid.y() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val0, i32 addrspace(1)* %ptr + %val1 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.x() + %val1 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { +define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tgid.z() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) 
#3 { +define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.x() + %val1 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.y() + %val1 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.x() + %val1 = call i32 @llvm.r600.read.tgid.y() + %val2 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + store volatile i32 %val2, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tidig.x() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tidig.y() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tidig.z() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.x() + %val1 = call i32 @llvm.r600.read.tgid.x() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret 
void +} + +; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.y() + %val1 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.x() + %val1 = call i32 @llvm.r600.read.tidig.y() + %val2 = call i32 @llvm.r600.read.tidig.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + store volatile i32 %val2, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.x() + %val1 = call i32 @llvm.r600.read.tidig.y() + %val2 = call i32 @llvm.r600.read.tidig.z() + %val3 = call i32 @llvm.r600.read.tgid.x() + %val4 = call i32 @llvm.r600.read.tgid.y() + %val5 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + store volatile i32 %val2, i32 addrspace(1)* %ptr + store volatile i32 %val3, i32 addrspace(1)* %ptr + store volatile i32 %val4, i32 addrspace(1)* %ptr + store volatile i32 %val5, i32 addrspace(1)* %ptr + ret void +} + +; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.local.size.x() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +define void @use_get_local_size_y(i32 
addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.local.size.y() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.local.size.z() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +; HSA: attributes #0 = { nounwind readnone } +; HSA: attributes #1 = { nounwind } +; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" } +; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" } +; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" } +; HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" } +; HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" } +; HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" } +; HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" } diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index 8c2a0795860d8..f8a74222d5669 100644 --- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -2,7 +2,7 @@ ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s declare i32 @llvm.SI.tid() nounwind readnone -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate +declare void @llvm.AMDGPU.barrier.local() nounwind convergent ; The required pointer calculations for the alloca'd actually requires ; an add and won't be folded into the addressing, which fails 
with a @@ -14,7 +14,7 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate ; FIXME: We end up with zero argument for ADD, because ; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index ; with the appropriate offset. We should fold this into the store. -; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} +; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}} ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}] ; ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this @@ -22,7 +22,7 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate ; to interpret: ; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b -; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16 +; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16 ; SI-PROMOTE: ds_write_b32 [[PTRREG]] define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %alloca = alloca [4 x i32], i32 4, align 16 @@ -35,7 +35,7 @@ define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 add %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b store i32 %result, i32* %alloca_ptr, align 4 ; Dummy call - call void @llvm.AMDGPU.barrier.local() nounwind noduplicate + call void @llvm.AMDGPU.barrier.local() nounwind convergent %reload = load i32, i32* %alloca_ptr, align 4 %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll new file mode 100644 index 0000000000000..0ef7d5184c1f6 --- /dev/null +++ b/test/CodeGen/AMDGPU/bitreverse.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI 
-check-prefix=FUNC %s + +declare i16 @llvm.bitreverse.i16(i16) #1 +declare i32 @llvm.bitreverse.i32(i32) #1 +declare i64 @llvm.bitreverse.i64(i64) #1 + +declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1 +declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1 + +declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1 +declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 + +declare i32 @llvm.AMDGPU.brev(i32) #1 + +; FUNC-LABEL: {{^}}s_brev_i16: +; SI: s_brev_b32 +define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { + %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 + store i16 %brev, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i16: +; SI: v_bfrev_b32_e32 +define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { + %val = load i16, i16 addrspace(1)* %valptr + %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 + store i16 %brev, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { + %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { + %val = load i32, i32 addrspace(1)* %valptr + %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_v2i32: +; SI: s_brev_b32 +; SI: s_brev_b32 +define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { + %brev = call <2 x i32> 
@llvm.bitreverse.v2i32(<2 x i32> %val) #1 + store <2 x i32> %brev, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_v2i32: +; SI: v_bfrev_b32_e32 +; SI: v_bfrev_b32_e32 +define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr + %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 + store <2 x i32> %brev, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_i64: +define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { + %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 + store i64 %brev, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i64: +define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { + %val = load i64, i64 addrspace(1)* %valptr + %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 + store i64 %brev, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_v2i64: +define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { + %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 + store <2 x i64> %brev, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_v2i64: +define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr + %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 + store <2 x i64> %brev, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}legacy_s_brev_i32: +; SI: s_brev_b32 +define void @legacy_s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %brev = call i32 @llvm.AMDGPU.brev(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/calling-conventions.ll 
b/test/CodeGen/AMDGPU/calling-conventions.ll new file mode 100644 index 0000000000000..57adc8be6a997 --- /dev/null +++ b/test/CodeGen/AMDGPU/calling-conventions.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; Make sure we don't crash or assert on spir_kernel calling convention. + +; SI-LABEL: {{^}}kernel: +; SI: s_endpgm +define spir_kernel void @kernel(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; FIXME: This is treated like a kernel +; SI-LABEL: {{^}}func: +; SI: s_endpgm +define spir_func void @func(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll new file mode 100644 index 0000000000000..1c5bed3b905f5 --- /dev/null +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -0,0 +1,98 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; OPT-LABEL: @test_no_sink_flat_small_offset_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %in +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: +; GCN: flat_load_dword +; GCN: {{^}}BB0_2: +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, 
label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(4)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: +; CI: buffer_load_dword {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 +define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(1)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: +; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(2)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 
= load i32, i32 addrspace(2)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index a68d110fdc96d..698494265a7d4 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,5 +1,7 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s @@ -115,35 +117,6 @@ done: ret void } -; OPT-LABEL: @test_no_sink_flat_small_offset_i32( -; OPT: getelementptr i32, i32 addrspace(4)* %in -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: -; GCN: flat_load_dword -; GCN: {{^}}BB4_2: - -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(4)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep - br label %done - -done: - ret 
void -} - ; OPT-LABEL: @test_sink_scratch_small_offset_i32( ; OPT-NOT: getelementptr [512 x i32] ; OPT: br i1 @@ -153,7 +126,7 @@ done: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: {{^}}BB5_2: +; GCN: {{^}}BB4_2: define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 @@ -189,7 +162,7 @@ done: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: {{^}}BB6_2: +; GCN: {{^}}BB5_2: define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 @@ -222,7 +195,7 @@ done: ; GCN: s_and_saveexec_b64 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GCN: {{^}}BB7_2: +; GCN: {{^}}BB6_2: define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { entry: %offset.ext = zext i32 %offset to i64 @@ -246,3 +219,220 @@ done: attributes #0 = { nounwind readnone } attributes #1 = { nounwind } + + + +; OPT-LABEL: @test_sink_constant_small_offset_i32 +; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32: +; GCN: s_and_saveexec_b64 +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 
addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32 +; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32: +; GCN: s_and_saveexec_b64 +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32 +; OPT-SI: getelementptr i32, i32 addrspace(2)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32: +; GCN: s_and_saveexec_b64 +; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400 + +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: 
+ %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32 +; OPT-SI: getelementptr i32, i32 addrspace(2)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}} +; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}} +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32 +; OPT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32: +; GCN: s_and_saveexec_b64 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, 
%entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32: +; GCN: s_and_saveexec_b64 +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}} +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} + +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}} +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} + +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32 +; OPT-SI: getelementptr i32, i32 addrspace(2)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-VI: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32: +; GCN: s_and_saveexec_b64 +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}} +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} + +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}} + +; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}} +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} + +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 + %tmp0 = icmp eq i32 %cond, 0 + br 
i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} diff --git a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll new file mode 100644 index 0000000000000..1a37e3c75fa38 --- /dev/null +++ b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s + + +; HSA-DEFAULT: flat_store_dword +; HSA-NODEFAULT: buffer_store_dword +; NOHSA-DEFAULT: buffer_store_dword +; NOHSA-NODEFAULT: flat_store_dword +define void @test(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index e1a0ee3ea2175..ec2971e98032a 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -36,6 +36,24 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali ret void } +; FIXME: or 0 should be replaced with copy +; FUNC-LABEL: {{^}}v_ctpop_i64_user: +; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] +; GCN-DAG: v_or_b32_e64 v[[RESULT_HI:[0-9]+]], 0, s{{[0-9]+}} +; GCN: 
buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GCN: s_endpgm +define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { + %val = load i64, i64 addrspace(1)* %in, align 8 + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone + %or = or i64 %ctpop, %s.val + store i64 %or, i64 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}s_ctpop_v2i64: ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 @@ -99,8 +117,8 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs ; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd ; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 ; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} -; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] -; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] +; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} ; GCN: s_endpgm define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 3399d9da29e3d..834922c62cbd9 100644 --- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -137,14 +137,8 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] ; SI-NOT: bfe ; SI-NOT: lshr -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias 
%in) nounwind { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> @@ -154,7 +148,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> ; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: ; SI: buffer_load_dword [[LOADREG:v[0-9]+]], -; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] +; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] ; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] ; SI: buffer_store_dword [[CONV]], define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll new file mode 100644 index 0000000000000..171883e4c74b2 --- /dev/null +++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -0,0 +1,52 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + +; The memory operand was dropped from the buffer_load_dword_offset +; when replaced with the addr64 during operand legalization, resulting +; in the global loads not being scheduled together. 
+ +; GCN-LABEL: {{^}}reschedule_global_load_lds_store: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: s_endpgm +define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { +entry: + %tid = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx = shl i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %gptr0, i32 %idx + %gep1 = getelementptr i32, i32 addrspace(1)* %gptr1, i32 %idx + %gep2 = getelementptr i32, i32 addrspace(3)* %lptr, i32 %tid + %cmp0 = icmp eq i32 %c, 0 + br i1 %cmp0, label %for.body, label %exit + +for.body: ; preds = %for.body, %entry + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.body ] + %gptr0.phi = phi i32 addrspace(1)* [ %gep0, %entry ], [ %gep0.inc, %for.body ] + %gptr1.phi = phi i32 addrspace(1)* [ %gep1, %entry ], [ %gep1.inc, %for.body ] + %lptr0.phi = phi i32 addrspace(3)* [ %gep2, %entry ], [ %gep2.inc, %for.body ] + %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 1 + %val0 = load i32, i32 addrspace(1)* %gep0 + store i32 %val0, i32 addrspace(3)* %lptr0.phi + %val1 = load i32, i32 addrspace(1)* %gep1 + store i32 %val1, i32 addrspace(3)* %lptr1 + %gep0.inc = getelementptr i32, i32 addrspace(1)* %gptr0.phi, i32 4 + %gep1.inc = getelementptr i32, i32 addrspace(1)* %gptr1.phi, i32 4 + %gep2.inc = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 4 + %i.inc = add nsw i32 %i, 1 + %cmp1 = icmp ne i32 %i, 256 + br i1 %cmp1, label %for.body, label %exit + +exit: ; preds = %for.body, %entry + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll 
b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll index 5e4654abd91bd..e657991557e3a 100644 --- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -10,13 +10,13 @@ declare void @llvm.AMDGPU.barrier.local() #1 ; CHECK: BB0_1: ; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] -; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], vcc, 4, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] -; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] -; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], vcc, 0x84, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] -; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 @@ -66,5 +66,5 @@ for.end: ; preds = %for.body } attributes #0 = { nounwind readnone } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll new file mode 100644 index 0000000000000..7d6eddb01993c --- /dev/null +++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -0,0 +1,125 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + +declare void 
@llvm.AMDGPU.barrier.local() #2 +declare i32 @llvm.r600.read.tidig.x() #0 + +@lds.obj = addrspace(3) global [256 x i32] undef, align 4 + +; GCN-LABEL: {{^}}write_ds_sub0_offset0_global: +; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0 +; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] +; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b +; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 +define void @write_ds_sub0_offset0_global() #0 { +entry: + %x.i = call i32 @llvm.r600.read.tidig.x() #1 + %sub1 = sub i32 0, %x.i + %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3 + store i32 123, i32 addrspace(3)* %arrayidx + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_max_offset: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535 +define void @add_x_shl_neg_to_sub_max_offset() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 65535, %shl + %ptr = inttoptr i32 %add to i8 addrspace(3)* + store i8 13, i8 addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_max_offset_p1: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN: ds_write_b8 [[NEG]], [[K]]{{$}} +define void @add_x_shl_neg_to_sub_max_offset_p1() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 65536, %shl + %ptr = inttoptr i32 %add to i8 addrspace(3)* + store i8 13, i8 addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_multi_use: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] 
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}} +; GCN: s_endpgm +define void @add_x_shl_neg_to_sub_multi_use() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add0 = add i32 123, %shl + %add1 = add i32 456, %shl + %ptr0 = inttoptr i32 %add0 to i32 addrspace(3)* + store volatile i32 13, i32 addrspace(3)* %ptr0 + %ptr1 = inttoptr i32 %add1 to i32 addrspace(3)* + store volatile i32 13, i32 addrspace(3)* %ptr1 + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_multi_use_same_offset: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} +; GCN: s_endpgm +define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 123, %shl + %ptr = inttoptr i32 %add to i32 addrspace(3)* + store volatile i32 13, i32 addrspace(3)* %ptr + store volatile i32 13, i32 addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_misaligned_i64_max_offset: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255 +define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 1019, %shl + %ptr = inttoptr i32 %add to i64 addrspace(3)* + store i64 123, i64 addrspace(3)* %ptr, align 4 + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: +; 
GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]] +; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} +define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 1020, %shl + %ptr = inttoptr i32 %add to i64 addrspace(3)* + store i64 123, i64 addrspace(3)* %ptr, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } +attributes #2 = { nounwind convergent } diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll index ec04f8b1acd6a..5170d9c82712f 100644 --- a/test/CodeGen/AMDGPU/ds_read2.ll +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -216,10 +216,8 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f ret void } -; We should be able to merge in this case, but probably not worth the effort. -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 +; SI-LABEL: {{^}}read2_ptr_is_subreg_f32: +; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} ; SI: s_endpgm define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -507,9 +505,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll 
b/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 842c2d8bc3394..0061aaf2cdbd1 100644 --- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -35,14 +35,11 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { ret void } -; FIXME: Shuffling to new superregister ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} -; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]] -; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]] -; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]] -; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]] +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]] ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD2]] ; CI: s_endpgm @@ -88,8 +85,13 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} + +; FIXME: These moves shouldn't be necessary, it should be able to +; store the same register if offset1 was the non-zero offset. 
+ +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { @@ -102,8 +104,9 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { @@ -115,19 +118,16 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { ret void } +; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v8f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword +; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 ; CI: s_endpgm define void 
@simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -138,33 +138,24 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { ret void } +; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v16f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: s_waitcnt lgkmcnt(0) -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: 
buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -238,9 +229,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll index e2e441214b4ab..4a0571ea16f2b 100644 --- a/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -65,7 +65,7 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add ; SI-LABEL: @simple_read2st64_f32_over_max_offset ; SI-NOT: ds_read2st64_b32 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 ; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] ; SI: s_endpgm @@ -197,7 +197,7 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a ; SI-LABEL: @simple_read2st64_f64_over_max_offset ; SI-NOT: ds_read2st64_b64 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: 
v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] ; SI: s_endpgm @@ -264,9 +264,5 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index d4973e377b596..9d3a293f3b898 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -345,8 +345,9 @@ define void @store_constant_disjoint_offsets() { ; SI-LABEL: @store_misaligned64_constant_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI: s_endpgm define void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -430,9 +431,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind 
"less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll index 358aa6a9e3636..5a1024ccf6d72 100644 --- a/test/CodeGen/AMDGPU/ds_write2st64.ll +++ b/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -109,9 +109,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll new file mode 100644 index 0000000000000..f4409a0984a96 --- /dev/null +++ b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -0,0 +1,11 @@ +; RUN: not llc -march=amdgcn -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported dynamic alloca in test_dynamic_stackalloc + +define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { + %alloca = alloca i32, i32 %n + store volatile i32 0, i32* %alloca + ret void +} diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll 
b/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll new file mode 100644 index 0000000000000..e325591396236 --- /dev/null +++ b/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; How the replacement of i64 stores with v2i32 stores resulted in +; breaking other users of the bitcast if they already existed + +; GCN-LABEL: {{^}}extract_vector_elt_select_error: +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dwordx2 +define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) nounwind { + %vec = bitcast i64 %val to <2 x i32> + %elt0 = extractelement <2 x i32> %vec, i32 0 + %elt1 = extractelement <2 x i32> %vec, i32 1 + + store volatile i32 %elt0, i32 addrspace(1)* %out + store volatile i32 %elt1, i32 addrspace(1)* %out + store volatile i64 %val, i64 addrspace(1)* %in + ret void +} + + +define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) nounwind { + %p0 = extractelement <2 x i64> %foo, i32 0 + %p1 = extractelement <2 x i64> %foo, i32 1 + %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1 + store volatile i64 %p1, i64 addrspace(1)* %out + store volatile i64 %p0, i64 addrspace(1)* %out1 + ret void +} + +define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) nounwind { + %dynelt = extractelement <2 x i64> %foo, i32 %elt + store volatile i64 %dynelt, i64 addrspace(1)* %out + ret void +} + +define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) nounwind { + %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo + %or = or <2 x i64> %load, %arst + %dynelt = extractelement <2 x i64> %or, i32 %elt + store volatile i64 %dynelt, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll index 
485c55870c479..19c17289da3d4 100644 --- a/test/CodeGen/AMDGPU/fadd64.ll +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -1,14 +1,44 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; CHECK: {{^}}fadd_f64: +; CHECK-LABEL: {{^}}v_fadd_f64: ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} +define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fadd double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}s_fadd_f64: +; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { + %r2 = fadd double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}v_fadd_v2f64: +; CHECK: v_add_f64 +; CHECK: v_add_f64 +; CHECK: buffer_store_dwordx4 +define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = fadd <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out + ret void +} -define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fadd double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void +; CHECK-LABEL: {{^}}s_fadd_v2f64: +; CHECK: v_add_f64 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: buffer_store_dwordx4 +define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { + %r2 = fadd <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out + ret void } diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll index e8c34f0141e40..c8ef5b101c4d0 100644 --- a/test/CodeGen/AMDGPU/fceil64.ll +++ b/test/CodeGen/AMDGPU/fceil64.ll @@ -17,12 +17,12 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone ; SI: s_lshr_b64 ; SI: s_not_b64 ; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 +; SI-DAG: cmp_gt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 +; SI-DAG: cmp_lt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 ; SI-DAG: v_cmp_lt_f64 ; SI-DAG: v_cmp_lg_f64 ; SI: s_and_b64 diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll index 5207ab57bade3..97d954fcc3c27 100644 --- a/test/CodeGen/AMDGPU/fcmp.ll +++ b/test/CodeGen/AMDGPU/fcmp.ll @@ -20,7 +20,7 @@ entry: ; CHECK: {{^}}fcmp_br: ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} -; CHECK-NEXT {{[0-9]+(5.0}} +; CHECK-NEXT: {{[0-9]+\(5.0}} define void @fcmp_br(i32 addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll index 8ceca078f2d6c..86e0c07323bb2 100644 --- a/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/test/CodeGen/AMDGPU/flat-address-space.ll @@ -7,39 +7,16 @@ ; specialize away generic pointer accesses. 
-; CHECK-LABEL: {{^}}branch_use_flat_i32: -; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; CHECK: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { -entry: - %cmp = icmp ne i32 %c, 0 - br i1 %cmp, label %local, label %global - -local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* - br label %end - -global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - br label %end - -end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - - - ; These testcases might become useless when there are optimizations to ; remove generic pointers. ; CHECK-LABEL: {{^}}store_flat_i32: -; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK-DAG: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]], +; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]], +; CHECK: s_waitcnt lgkmcnt(0) +; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* @@ -83,7 +60,7 @@ define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { -; CHECK-LABEL @load_flat_i32: +; CHECK-LABEL: load_flat_i32: ; CHECK: flat_load_dword define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* @@ -92,7 
+69,7 @@ define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noa ret void } -; CHECK-LABEL @load_flat_i64: +; CHECK-LABEL: load_flat_i64: ; CHECK: flat_load_dwordx2 define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* @@ -101,7 +78,7 @@ define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa ret void } -; CHECK-LABEL @load_flat_v4i32: +; CHECK-LABEL: load_flat_v4i32: ; CHECK: flat_load_dwordx4 define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* @@ -110,7 +87,7 @@ define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> add ret void } -; CHECK-LABEL @sextload_flat_i8: +; CHECK-LABEL: sextload_flat_i8: ; CHECK: flat_load_sbyte define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* @@ -120,7 +97,7 @@ define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ret void } -; CHECK-LABEL @zextload_flat_i8: +; CHECK-LABEL: zextload_flat_i8: ; CHECK: flat_load_ubyte define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* @@ -130,7 +107,7 @@ define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ret void } -; CHECK-LABEL @sextload_flat_i16: +; CHECK-LABEL: sextload_flat_i16: ; CHECK: flat_load_sshort define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* @@ -140,7 +117,7 @@ define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ret void } -; CHECK-LABEL 
@zextload_flat_i16: +; CHECK-LABEL: zextload_flat_i16: ; CHECK: flat_load_ushort define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* @@ -150,35 +127,9 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ret void } - - -; TODO: This should not be zero when registers are used for small -; scratch allocations again. - -; Check for prologue initializing special SGPRs pointing to scratch. -; CHECK-LABEL: {{^}}store_flat_scratch: -; CHECK: s_movk_i32 flat_scratch_lo, 0 -; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} -; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} -; CHECK: flat_store_dword -; CHECK: s_barrier -; CHECK: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { - %alloca = alloca i32, i32 9, align 4 - %x = call i32 @llvm.r600.read.tidig.x() #3 - %pptr = getelementptr i32, i32* %alloca, i32 %x - %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr - ; Dummy call - call void @llvm.AMDGPU.barrier.local() #1 - %reload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %reload, i32 addrspace(1)* %out, align 4 - ret void -} - declare void @llvm.AMDGPU.barrier.local() #1 declare i32 @llvm.r600.read.tidig.x() #3 attributes #0 = { nounwind } -attributes #1 = { nounwind noduplicate } +attributes #1 = { nounwind convergent } attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll new file mode 100644 index 0000000000000..e2ae3353ae1d3 --- /dev/null +++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -march=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=CI +; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI + +; 
GCN-LABEL: {{^}}no_vcc_no_flat: +; GCN: ; NumSgprs: 8 +define void @no_vcc_no_flat() { +entry: + call void asm sideeffect "", "~{SGPR7}"() + ret void +} + +; GCN-LABEL: {{^}}vcc_no_flat: +; GCN: ; NumSgprs: 10 +define void @vcc_no_flat() { +entry: + call void asm sideeffect "", "~{SGPR7},~{VCC}"() + ret void +} + +; GCN-LABEL: {{^}}no_vcc_flat: +; CI: ; NumSgprs: 12 +; VI: ; NumSgprs: 14 +define void @no_vcc_flat() { +entry: + call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() + ret void +} + +; GCN-LABEL: {{^}}vcc_flat: +; CI: ; NumSgprs: 12 +; VI: ; NumSgprs: 14 +define void @vcc_flat() { +entry: + call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() + ret void +} diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll index bd574b877117e..6f3437048ed89 100644 --- a/test/CodeGen/AMDGPU/fma-combine.ll +++ b/test/CodeGen/AMDGPU/fma-combine.ll @@ -364,5 +364,205 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias % ret void } +; +; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y) +; + +; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]] +define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, 1.0 + %m = fmul float %a, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]] +define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, 1.0 + %m = fmul float %y, %a + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}test_f32_mul_add_x_negone_y: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, -1.0 + %m = fmul float %a, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, -1.0 + %m = fmul float %y, %a + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]] +define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float 1.0, %x + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]] +define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float 1.0, %x + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* 
%in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float -1.0, %x + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float -1.0, %x + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, 1.0 + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, 1.0 + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]] +define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, -1.0 + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]] 
+define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, -1.0 + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; +; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y)) +; + +; FUNC-LABEL: {{^}}test_f32_interp: +; SI: v_mad_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]] +; SI: v_mac_f32_e32 [[VR]], [[VT]], [[VX:v[0-9]]] +define void @test_f32_interp(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2, + float addrspace(1)* %in3) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %t = load float, float addrspace(1)* %in3 + %t1 = fsub float 1.0, %t + %tx = fmul float %x, %t + %ty = fmul float %y, %t1 + %r = fadd float %tx, %ty + store float %r, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f64_interp: +; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]] +; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] +define void @test_f64_interp(double addrspace(1)* %out, + double addrspace(1)* %in1, + double addrspace(1)* %in2, + double addrspace(1)* %in3) { + %x = load double, double addrspace(1)* %in1 + %y = load double, double addrspace(1)* %in2 + %t = load double, double addrspace(1)* %in3 + %t1 = fsub double 1.0, %t + %tx = fmul double %x, %t + %ty = fmul double %y, %t1 + %r = fadd double %tx, %ty + store double %r, double addrspace(1)* %out + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll index 413957d2982ac..d374fb67350cc 100644 --- a/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -87,6 +87,46 @@ define void 
@test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace( ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0 + %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ogt <1 x float> %a, %b + %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, <1 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32: +; SI-SAFE: v_max_legacy_f32_e32 +; SI-SAFE: v_max_legacy_f32_e32 +; SI-SAFE: v_max_legacy_f32_e32 +; SI-NONAN: v_max_f32_e32 +; SI-NONAN: v_max_f32_e32 +; SI-NONAN: v_max_f32_e32 +define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <3 x float>, <3 x float> addrspace(1)* %gep.0 + %b = load <3 x float>, <3 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ogt <3 x float> %a, %b + %val = select <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, <3 x float> addrspace(1)* %out + ret void +} ; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use ; SI: buffer_load_dword [[A:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll index 6a625c239d761..52fc3d0d251a4 100644 --- a/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -96,6 +96,69 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace( ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0 + %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ult <1 x float> %a, %b + %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, <1 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32: +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 + +; SI-NONAN: v_min_f32_e32 +; SI-NONAN: v_min_f32_e32 +define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <2 x float>, <2 x float> addrspace(1)* %gep.0 + %b = load <2 x float>, <2 x float> 
addrspace(1)* %gep.1 + + %cmp = fcmp ult <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %val, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32: +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 + +; SI-NONAN: v_min_f32_e32 +; SI-NONAN: v_min_f32_e32 +; SI-NONAN: v_min_f32_e32 +define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <3 x float>, <3 x float> addrspace(1)* %gep.0 + %b = load <3 x float>, <3 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ult <3 x float> %a, %b + %val = select <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, <3 x float> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll new file mode 100644 index 0000000000000..1ee92b2f7c086 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -0,0 +1,102 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't +; make add an instruction if the fadd has more than one use. 
+ +declare float @llvm.fabs.f32(float) #1 + +; GCN-LABEL: {{^}}multiple_fadd_use_test: +; GCN: v_max_legacy_f32_e64 [[A16:v[0-9]+]], +; GCN: v_add_f32_e32 [[A17:v[0-9]+]], [[A16]], [[A16]] +; GCN: v_mul_f32_e32 [[A18:v[0-9]+]], [[A17]], [[A17]] +; GCN: v_mad_f32 [[A20:v[0-9]+]], -[[A18]], [[A17]], 1.0 +; GCN: buffer_store_dword [[A20]] +define void @multiple_fadd_use_test(float addrspace(1)* %out, float %x, float %y, float %z) #0 { + %a11 = fadd fast float %y, -1.0 + %a12 = call float @llvm.fabs.f32(float %a11) + %a13 = fadd fast float %x, -1.0 + %a14 = call float @llvm.fabs.f32(float %a13) + %a15 = fcmp ogt float %a12, %a14 + %a16 = select i1 %a15, float %a12, float %a14 + %a17 = fmul fast float %a16, 2.0 + %a18 = fmul fast float %a17, %a17 + %a19 = fmul fast float %a18, %a17 + %a20 = fsub fast float 1.0, %a19 + store float %a20, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}multiple_use_fadd_fmac +; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_mac_f32_e64 [[MAD:v[0-9]+]], 2.0, [[X]] +; GCN-DAG: buffer_store_dword [[MUL2]] +; GCN-DAG: buffer_store_dword [[MAD]] +; GCN: s_endpgm +define void @multiple_use_fadd_fmac(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %mul2 = fmul fast float %x, 2.0 + %mad = fadd fast float %mul2, %y + store float %mul2, float addrspace(1)* %out + store float %mad, float addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}multiple_use_fadd_fmad: +; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}| +; GCN-DAG: v_mad_f32 [[MAD:v[0-9]+]], 2.0, |[[X]]|, v{{[0-9]+}} +; GCN-DAG: buffer_store_dword [[MUL2]] +; GCN-DAG: buffer_store_dword [[MAD]] +; GCN: s_endpgm +define void @multiple_use_fadd_fmad(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %x.abs = call float @llvm.fabs.f32(float %x) + %mul2 = fmul fast float 
%x.abs, 2.0 + %mad = fadd fast float %mul2, %y + store float %mul2, float addrspace(1)* %out + store float %mad, float addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad: +; GCN: v_mad_f32 {{v[0-9]+}}, 2.0, |[[X:s[0-9]+]]|, v{{[0-9]+}} +; GCN: v_mad_f32 {{v[0-9]+}}, 2.0, |[[X]]|, v{{[0-9]+}} +define void @multiple_use_fadd_multi_fmad(float addrspace(1)* %out, float %x, float %y, float %z) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %x.abs = call float @llvm.fabs.f32(float %x) + %mul2 = fmul fast float %x.abs, 2.0 + %mad0 = fadd fast float %mul2, %y + %mad1 = fadd fast float %mul2, %z + store float %mad0, float addrspace(1)* %out + store float %mad1, float addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}fmul_x2_xn2: +; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], -4.0, [[X:s[0-9]+]] +; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] +; GCN: buffer_store_dword [[RESULT]] +define void @fmul_x2_xn2(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %mul2 = fmul fast float %x, 2.0 + %muln2 = fmul fast float %x, -2.0 + %mul = fmul fast float %mul2, %muln2 + store float %mul, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmul_x2_xn3: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xc0c00000 +; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] +; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] +; GCN: buffer_store_dword [[RESULT]] +define void @fmul_x2_xn3(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %mul2 = fmul fast float %x, 2.0 + %muln2 = fmul fast float %x, -3.0 + %mul = fmul fast float %mul2, %muln2 + store float %mul, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll 
b/test/CodeGen/AMDGPU/fneg-fabs.ll index 3b4930d9897d1..b99d2712ed758 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -34,8 +34,7 @@ define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { ; R600: |PV.{{[XYZW]}}| ; R600: -PV -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) @@ -49,8 +48,7 @@ define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { ; R600: |PV.{{[XYZW]}}| ; R600: -PV -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @fabs(float %bc) @@ -60,8 +58,7 @@ define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { } ; FUNC-LABEL: {{^}}fneg_fabs_f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) %fsub = fsub float -0.000000e+00, %fabs @@ -85,11 +82,8 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: -PV -; FIXME: SGPR should be used directly for first src operand. 
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs @@ -97,14 +91,11 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { ret void } -; FIXME: SGPR should be used directly for first src operand. ; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll index 6618d8b5e57e3..83a8ad8901d21 100644 --- a/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -29,12 +29,12 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { ; SI: s_lshr_b64 ; SI: s_not_b64 ; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 +; SI-DAG: cmp_gt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 +; SI-DAG: cmp_lt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 ; SI: 
s_endpgm define void @ftrunc_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.trunc.f64(double %x) nounwind readnone diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll index 471b0f6b13e78..f5ab390ce686d 100644 --- a/test/CodeGen/AMDGPU/gep-address-space.ll +++ b/test/CodeGen/AMDGPU/gep-address-space.ll @@ -11,24 +11,35 @@ define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { ret void } -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on ; SI, which is why it is being OR'd with the base pointer. ; SI: s_or_b32 ; CI: s_add_i32 ; CHECK: ds_write_b32 +define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 store i32 99, i32 addrspace(3)* %p ret void } -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { ; CHECK-LABEL: {{^}}gep_as_vector_v4: -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 + +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} + +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CHECK: s_endpgm +define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> %p0 = 
extractelement <4 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 @@ -41,10 +52,15 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind ret void } -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { ; CHECK-LABEL: {{^}}gep_as_vector_v2: -; CHECK: s_add_i32 -; CHECK: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CHECK: s_endpgm +define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll new file mode 100644 index 0000000000000..bc5f031cd4a29 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-constant.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOHSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s + +@readonly = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0] +@readonly2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0] + +; GCN-LABEL: {{^}}main: +; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], readonly +; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], 0 +; GCN: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], readonly +; GCN: s_addc_u32 
s{{[0-9]+}}, s[[PC1_HI]], 0 +; NOHSA: .text +; HSA: .hsatext +; GCN: readonly: +; GCN: readonly2: +define void @main(i32 %index, float addrspace(1)* %out) { + %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly, i32 0, i32 %index + %val = load float, float addrspace(2)* %ptr + store float %val, float addrspace(1)* %out + %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly2, i32 0, i32 %index + %val2 = load float, float addrspace(2)* %ptr2 + store float %val2, float addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll index 79b83452939e9..e5e6be2199c37 100644 --- a/test/CodeGen/AMDGPU/global-extload-i32.ll +++ b/test/CodeGen/AMDGPU/global-extload-i32.ll @@ -49,8 +49,7 @@ define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: ; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i32>, <2 x i32> addrspace(1)* %in @@ -63,8 +62,7 @@ define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; SI: buffer_load_dwordx2 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i32>, <2 x i32> addrspace(1)* %in @@ -75,10 +73,8 @@ define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: ; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: 
buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -93,10 +89,8 @@ define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -106,22 +100,12 @@ define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i } ; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -131,14 +115,8 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: 
buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 @@ -148,15 +126,10 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -166,50 +139,34 @@ define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: 
buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i32>, <16 x i32> addrspace(1)* %in @@ -219,40 +176,19 @@ define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void 
@zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i32>, <16 x i32> addrspace(1)* %in @@ -262,41 +198,15 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 @@ -331,41 +241,25 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; 
SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { @@ -376,77 +270,34 @@ define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: 
buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void 
@zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll index 146f0a5fbf260..6786e4a2f375a 100644 --- a/test/CodeGen/AMDGPU/global_atomics.ll +++ b/test/CodeGen/AMDGPU/global_atomics.ll @@ -12,7 +12,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: -; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -105,7 +105,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: -; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -197,7 +197,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: -; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -289,7 +289,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: -; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -381,7 +381,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: -; 
GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -473,7 +473,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: -; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -565,7 +565,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: -; GCN: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -657,7 +657,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: -; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -749,7 +749,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: -; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -838,7 +838,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: -; GCN: 
buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll index bf8f11860b50d..a02cbf43c4009 100644 --- a/test/CodeGen/AMDGPU/half.ll +++ b/test/CodeGen/AMDGPU/half.ll @@ -105,6 +105,26 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 + +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, <8 x float> addrspace(1)* %out @@ -112,12 +132,24 @@ define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x hal } ; GCN-LABEL: {{^}}extload_f16_to_f64_arg: +; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} +; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}} +; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]] +; GCN: buffer_store_dwordx2 [[RESULT]] define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to double store double %ext, double addrspace(1)* %out ret void } + ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v 
+; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: s_endpgm define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x double> store <2 x double> %ext, <2 x double> addrspace(1)* %out @@ -125,6 +157,16 @@ define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x ha } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: s_endpgm define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x double> store <3 x double> %ext, <3 x double> addrspace(1)* %out @@ -132,6 +174,19 @@ define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x ha } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: s_endpgm define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, <4 x double> addrspace(1)* %out @@ -139,6 +194,37 @@ define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x ha } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v + +; 
GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v + +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 + +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 + +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 + +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 + +; GCN: s_endpgm define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x double> store <8 x double> %ext, <8 x double> addrspace(1)* %out @@ -194,6 +280,12 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace( } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: +; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} +; GCN: s_endpgm define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x float> @@ -226,6 +318,46 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: 
buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 + +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 + +; GCN: s_endpgm define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x float> @@ -246,6 +378,14 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: +; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} +; GCN: s_endpgm define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x double> @@ -254,6 +394,25 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: + 
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32 +; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]] +; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN-NOT: v_cvt_f32_f16_e32 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN-NOT: v_cvt_f64_f32_e32 + +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 +; GCN: s_endpgm define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x double> @@ -310,13 +469,12 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 ret void } -; FIXME: Shouldn't do 4th conversion ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: ; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 +; GCN-NOT: v_cvt_f16_f32_e32 ; GCN: buffer_store_short ; GCN: buffer_store_dword ; GCN: s_endpgm @@ -346,14 +504,8 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 } ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 @@ -379,54 +531,42 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 } ; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: 
buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: 
buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short ; GCN: s_endpgm define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll new file mode 100644 index 0000000000000..1d76c40c042e8 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-globals.ll @@ -0,0 +1,132 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=ASM %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s | FileCheck %s --check-prefix=ELF + +@internal_global_program = internal addrspace(1) global i32 0 +@common_global_program = common addrspace(1) global i32 0 +@external_global_program = addrspace(1) global i32 0 + +@internal_global_agent = internal addrspace(1) global i32 0, section ".hsadata_global_agent" +@common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent" +@external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent" + +@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0 +@external_readonly = unnamed_addr addrspace(2) constant i32 0 + +define void @test() { + ret void +} + +; ASM: .amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_program +; ASM: internal_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_program +; ASM: common_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_program +; ASM: external_global_program: +; ASM: .long 0 + +; ASM: 
.amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_agent +; ASM: internal_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_agent +; ASM: common_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_agent +; ASM: external_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global internal_readonly +; ASM: .hsatext +; ASM: internal_readonly: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_readonly +; ASM: .hsatext +; ASM: external_readonly: +; ASM: .long 0 + +; ELF: Section { +; ELF: Name: .hsadata_global_program +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x100003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Section { +; ELF: Name: .hsadata_global_agent +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x900003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Symbol { +; ELF: Name: common_global_agent +; ELF: Binding: Local +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: common_global_program +; ELF: Binding: Local +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_agent +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_program +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_readonly +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsatext +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_agent +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_program +; ELF: 
Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_readonly +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsatext +; ELF: } diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll new file mode 100644 index 0000000000000..1999dc38a6b0f --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s + +@internal_group = internal addrspace(3) global i32 undef +@external_group = addrspace(3) global i32 undef + +define void @test() { +entry: + store i32 0, i32 addrspace(3)* @internal_group + store i32 0, i32 addrspace(3)* @external_group + ret void +} + +; HSA-NOT: internal_group: +; HSA-NOT: external_group: diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 653a6bb1b6098..abc89b7fd837c 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -1,11 +1,24 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -s -sd | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s +; RUN: llc < %s 
-mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF ; The SHT_NOTE section contains the output from the .hsa_code_object_* ; directives. +; ELF: Section { +; ELF: Name: .hsatext +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0xC00007) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_CODE (0x400000) +; ELF: SHF_EXECINSTR (0x4) +; ELF: SHF_WRITE (0x1) +; ELF: } + ; ELF: SHT_NOTE ; ELF: 0000: 04000000 08000000 01000000 414D4400 ; ELF: 0010: 01000000 00000000 04000000 1B000000 @@ -13,20 +26,31 @@ ; ELF: 0030: 00000000 00000000 414D4400 414D4447 ; ELF: 0040: 50550000 +; ELF: Symbol { +; ELF: Name: simple +; ELF: Type: AMDGPU_HSA_KERNEL (0xA) +; ELF: } + ; HSA: .hsa_code_object_version 1,0 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" +; HSA: .hsatext + +; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: .end_amd_kernel_code_t -; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0 +; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 ; On VI+ we also need to set MTYPE = 2 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000 -; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0 +; Make sure we generate flat store for HSA +; HSA: flat_store_dword v{{[0-9]+}} define void @simple(i32 addrspace(1)* %out) { entry: diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll new file mode 100644 index 0000000000000..5906b2f157096 --- /dev/null +++ b/test/CodeGen/AMDGPU/image-attributes.ll @@ -0,0 +1,206 
@@ +; RUN: llc -march=r600 -mcpu=juniper < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; === WIDTH ================================================================== +; 9 implicit args = 9 dwords to first image argument. +; First width at dword index 9+1 -> KC0[2].Z + +; FUNC-LABEL: {{^}}width_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z +define void @width_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}width_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z +define void @width_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === HEIGHT ================================================================= +; First height at dword index 9+2 -> KC0[2].W + +; FUNC-LABEL: {{^}}height_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].W +define void @height_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}height_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].W +define void @height_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + 
ret void +} + + +; === DEPTH ================================================================== +; First depth at dword index 9+3 -> KC0[3].X + +; FUNC-LABEL: {{^}}depth_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].X +define void @depth_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 2 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === CHANNEL DATA TYPE ====================================================== +; First channel data type at dword index 9+4 -> KC0[3].Y + +; FUNC-LABEL: {{^}}data_type_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Y +define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}data_type_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Y +define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === CHANNEL ORDER ========================================================== +; First channel order at dword index 9+5 -> KC0[3].Z + +; FUNC-LABEL: {{^}}channel_order_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Z +define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] 
%0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}channel_order_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Z +define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === 2ND IMAGE ============================================================== +; 9 implicit args + 2 explicit args + 5 implicit args for 1st image argument +; = 16 dwords to 2nd image argument. +; Height of the second image is at 16+2 -> KC0[4].Z +; +; FUNC-LABEL: {{^}}image_arg_2nd: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[4].Z +define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, + i32 %x, + %opencl.image2d_t addrspace(1)* %in2, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %in2) #0 + %1 = extractvalue [3 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque + +declare [3 x i32] @llvm.OpenCL.image.get.size.2d(%opencl.image2d_t addrspace(1)*) #0 +declare [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)*) #0 +declare [2 x i32] @llvm.OpenCL.image.get.format.2d(%opencl.image2d_t addrspace(1)*) #0 +declare [2 x i32] @llvm.OpenCL.image.get.format.3d(%opencl.image3d_t addrspace(1)*) #0 + +attributes #0 = { readnone } + +!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} +!0 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @width_2d, + !10, !20, !30, !40, !50} +!1 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @width_3d, + !10, !21, !31, !41, !50} +!2 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @height_2d, + !10, 
!20, !30, !40, !50} +!3 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @height_3d, + !10, !21, !31, !41, !50} +!4 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @depth_3d, + !10, !21, !31, !41, !50} +!5 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @data_type_2d, + !10, !20, !30, !40, !50} +!6 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @data_type_3d, + !10, !21, !31, !41, !50} +!7 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @channel_order_2d, + !10, !20, !30, !40, !50} +!8 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @channel_order_3d, + !10, !21, !31, !41, !50} +!9 = !{void (%opencl.image3d_t addrspace(1)*, i32, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @image_arg_2nd, !12, !22, !32, !42, !52} + +!10 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!20 = !{!"kernel_arg_access_qual", !"read_only", !"none"} +!21 = !{!"kernel_arg_access_qual", !"read_only", !"none"} +!30 = !{!"kernel_arg_type", !"image2d_t", !"int*"} +!31 = !{!"kernel_arg_type", !"image3d_t", !"int*"} +!40 = !{!"kernel_arg_base_type", !"image2d_t", !"int*"} +!41 = !{!"kernel_arg_base_type", !"image3d_t", !"int*"} +!50 = !{!"kernel_arg_type_qual", !"", !""} + +!12 = !{!"kernel_arg_addr_space", i32 1, i32 0, i32 1, i32 1} +!22 = !{!"kernel_arg_access_qual", !"read_only", !"none", !"write_only", !"none"} +!32 = !{!"kernel_arg_type", !"image3d_t", !"sampler_t", !"image2d_t", !"int*"} +!42 = !{!"kernel_arg_base_type", !"image3d_t", !"sampler_t", !"image2d_t", !"int*"} +!52 = !{!"kernel_arg_type_qual", !"", !"", !"", !""} diff --git a/test/CodeGen/AMDGPU/image-resource-id.ll b/test/CodeGen/AMDGPU/image-resource-id.ll new file mode 100644 index 0000000000000..d4cf349442409 --- /dev/null +++ b/test/CodeGen/AMDGPU/image-resource-id.ll @@ -0,0 +1,409 @@ +; RUN: llc -march=r600 -mcpu=juniper < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; === 1 image arg, read_only 
=================================================== + +; FUNC-LABEL: {{^}}test_2d_rd_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_rd_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 1 image arg, write_only ================================================== + +; FUNC-LABEL: {{^}}test_2d_wr_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_wr_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 2 image args, read_only ================================================== + +; FUNC-LABEL: {{^}}test_2d_rd_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV 
[[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2d_rd_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_rd_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_rd_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 2 image args, write_only ================================================= + +; FUNC-LABEL: {{^}}test_2d_wr_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; 
EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2d_wr_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_wr_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_wr_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 3 image args, read_only ================================================== + +; FUNC-LABEL: {{^}}test_2d_rd_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; 
EG-NEXT: 2( +define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + %opencl.image2d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_3d_rd_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + %opencl.image3d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 3 image args, write_only ================================================= + +; FUNC-LABEL: {{^}}test_2d_wr_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; write_only + %opencl.image2d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_3d_wr_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; write_only + %opencl.image3d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + 
store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 3 image args, mixed ====================================================== + +; FUNC-LABEL: {{^}}test_2d_mix_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + %opencl.image2d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_mix_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + %opencl.image3d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2d_mix_3_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + %opencl.image2d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_mix_3_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; 
read_only + %opencl.image3d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque + +declare i32 @llvm.OpenCL.image.get.resource.id.2d(%opencl.image2d_t addrspace(1)*) #0 +declare i32 @llvm.OpenCL.image.get.resource.id.3d(%opencl.image3d_t addrspace(1)*) #0 + +attributes #0 = { readnone } + +!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, + !14, !15, !16, !17, !18, !19} +!0 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_rd_1_0, + !110, !120, !130, !140, !150} +!1 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_rd_1_0, + !110, !120, !131, !141, !150} +!2 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_wr_1_0, + !110, !121, !130, !140, !150} +!3 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_wr_1_0, + !110, !121, !131, !141, !150} +!110 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!120 = !{!"kernel_arg_access_qual", !"read_only", !"none"} +!121 = !{!"kernel_arg_access_qual", !"write_only", !"none"} +!130 = !{!"kernel_arg_type", !"image2d_t", !"int*"} +!131 = !{!"kernel_arg_type", !"image3d_t", !"int*"} +!140 = !{!"kernel_arg_base_type", !"image2d_t", !"int*"} +!141 = !{!"kernel_arg_base_type", !"image3d_t", !"int*"} +!150 = !{!"kernel_arg_type_qual", !"", !""} + +!4 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_rd_2_0, !112, !122, !132, !142, !152} +!5 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_rd_2_1, !112, !122, !132, !142, !152} +!6 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_rd_2_0, !112, !122, !133, !143, !152} +!7 = 
!{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_rd_2_1, !112, !122, !133, !143, !152} +!8 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_wr_2_0, !112, !123, !132, !142, !152} +!9 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_wr_2_1, !112, !123, !132, !142, !152} +!10 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_wr_2_0, !112, !123, !133, !143, !152} +!11 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_wr_2_1, !112, !123, !133, !143, !152} +!112 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!122 = !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"none"} +!123 = !{!"kernel_arg_access_qual", !"write_only", !"write_only", !"none"} +!132 = !{!"kernel_arg_type", !"image2d_t", !"image2d_t", !"int*"} +!133 = !{!"kernel_arg_type", !"image3d_t", !"image3d_t", !"int*"} +!142 = !{!"kernel_arg_base_type", !"image2d_t", !"image2d_t", !"int*"} +!143 = !{!"kernel_arg_base_type", !"image3d_t", !"image3d_t", !"int*"} +!152 = !{!"kernel_arg_type_qual", !"", !"", !""} + +!12 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_rd_3_0, + !114, !124, !134, !144, !154} +!13 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_rd_3_0, + !114, !124, !135, !145, !154} +!14 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_wr_3_0, + !114, !125, !134, !144, !154} +!15 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_wr_3_0, + !114, !125, 
!135, !145, !154} +!16 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_mix_3_0, + !114, !126, !134, !144, !154} +!17 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_mix_3_0, + !114, !126, !135, !145, !154} +!18 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_mix_3_1, + !114, !127, !134, !144, !154} +!19 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_mix_3_1, + !114, !127, !135, !145, !154} +!114 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1} +!124 = !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"read_only", !"none"} +!125 = !{!"kernel_arg_access_qual", !"write_only", !"write_only", !"write_only", !"none"} +!126 = !{!"kernel_arg_access_qual", !"write_only", !"read_only", !"read_only", !"none"} +!127 = !{!"kernel_arg_access_qual", !"write_only", !"read_only", !"write_only", !"none"} +!134 = !{!"kernel_arg_type", !"image2d_t", !"image3d_t", !"image2d_t", !"int*"} +!135 = !{!"kernel_arg_type", !"image3d_t", !"image2d_t", !"image3d_t", !"int*"} +!144 = !{!"kernel_arg_base_type", !"image2d_t", !"image3d_t", !"image2d_t", !"int*"} +!145 = !{!"kernel_arg_base_type", !"image3d_t", !"image2d_t", !"image3d_t", !"int*"} +!154 = !{!"kernel_arg_type_qual", !"", !"", !"", !""} diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll index 12eed550eb1fe..8db9ea4ccf314 100644 --- a/test/CodeGen/AMDGPU/imm.ll +++ b/test/CodeGen/AMDGPU/imm.ll @@ -3,8 +3,7 @@ ; Use a 64-bit value with lo bits that can be represented as an inline constant ; CHECK-LABEL: {{^}}i64_imm_inline_lo: -; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] +; CHECK: v_mov_b32_e32 
v[[LO_VGPR:[0-9]+]], 5 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { entry: @@ -14,8 +13,7 @@ entry: ; Use a 64-bit value with hi bits that can be represented as an inline constant ; CHECK-LABEL: {{^}}i64_imm_inline_hi: -; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] +; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5 ; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { entry: @@ -24,10 +22,8 @@ entry: } ; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { store i64 -9223372036854775808, i64 addrspace(1) *%out @@ -523,10 +519,8 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { ; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { store double -0.0, double addrspace(1)* %out @@ -606,10 +600,8 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { } ; CHECK-LABEL: {{^}}store_literal_imm_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 
0x40b00000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_literal_imm_f64(double addrspace(1)* %out) { store double 4096.0, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f551606d63a73..e40cac22725ca 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -5,23 +5,52 @@ ; indexing of vectors. ; CHECK-LABEL: {{^}}extract_w_offset: +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 ; CHECK: s_mov_b32 m0 ; CHECK-NEXT: v_movrels_b32_e32 define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { entry: - %0 = add i32 %in, 1 - %1 = extractelement <4 x float> , i32 %0 - store float %1, float addrspace(1)* %out + %idx = add i32 %in, 1 + %elt = extractelement <4 x float> , i32 %idx + store float %elt, float addrspace(1)* %out + ret void +} + +; XXX: Could do v_or_b32 directly +; CHECK-LABEL: {{^}}extract_w_offset_salu_use_vector: +; CHECK-DAG: s_or_b32 +; CHECK-DAG: s_or_b32 +; CHECK-DAG: s_or_b32 +; CHECK-DAG: s_or_b32 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK: s_mov_b32 m0 +; CHECK-NEXT: v_movrels_b32_e32 +define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { +entry: + %idx = add i32 %in, 1 + %vec = or <4 x i32> %or.val, + %elt = 
extractelement <4 x i32> %vec, i32 %idx + store i32 %elt, i32 addrspace(1)* %out ret void } ; CHECK-LABEL: {{^}}extract_wo_offset: +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 ; CHECK: s_mov_b32 m0 ; CHECK-NEXT: v_movrels_b32_e32 define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { entry: - %0 = extractelement <4 x float> , i32 %in - store float %0, float addrspace(1)* %out + %elt = extractelement <4 x float> , i32 %in + store float %elt, float addrspace(1)* %out ret void } @@ -37,6 +66,19 @@ entry: ret void } +; CHECK-LABEL: {{^}}extract_neg_offset_sgpr_loaded: +; The offset depends on the register that holds the first element of the vector. +; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} +; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0 +define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { +entry: + %index = add i32 %offset, -512 + %or = or <4 x i32> %vec0, %vec1 + %value = extractelement <4 x i32> %or, i32 %index + store i32 %value, i32 addrspace(1)* %out + ret void +} + ; CHECK-LABEL: {{^}}extract_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. ; CHECK: v_readfirstlane_b32 @@ -87,6 +129,21 @@ entry: ret void } +; The vector indexed into is originally loaded into an SGPR rather +; than built with a reg_sequence + +; CHECK-LABEL: {{^}}insert_neg_offset_sgpr_loadreg: +; The offset depends on the register that holds the first element of the vector. 
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} +; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}} +define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { +entry: + %index = add i32 %offset, -512 + %value = insertelement <4 x i32> %vec, i32 5, i32 %index + store <4 x i32> %value, <4 x i32> addrspace(1)* %out + ret void +} + ; CHECK-LABEL: {{^}}insert_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. ; CHECK: v_readfirstlane_b32 diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll index d63e1b6c5212f..2a3b29f54fa9b 100644 --- a/test/CodeGen/AMDGPU/indirect-private-64.ll +++ b/test/CodeGen/AMDGPU/indirect-private-64.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() convergent nounwind ; SI-LABEL: {{^}}private_access_f64_alloca: @@ -18,7 +18,7 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double %array = alloca double, i32 16, align 8 %ptr = getelementptr double, double* %array, i32 %b store double %val, double* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load double, double* %ptr, align 8 store double %result, double addrspace(1)* %out, align 8 ret void @@ -29,20 +29,16 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double ; SI-ALLOCA: buffer_store_dwordx4 ; SI-ALLOCA: buffer_load_dwordx4 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: 
ds_write_b64 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_read_b64 +; SI-PROMOTE: ds_read_b64 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca <2 x double>, i32 16, align 16 %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b store <2 x double> %val, <2 x double>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load <2 x double>, <2 x double>* %ptr, align 16 store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16 ret void @@ -60,7 +56,7 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs %array = alloca i64, i32 16, align 8 %ptr = getelementptr i64, i64* %array, i32 %b store i64 %val, i64* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load i64, i64* %ptr, align 8 store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -71,20 +67,16 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs ; SI-ALLOCA: buffer_store_dwordx4 ; SI-ALLOCA: buffer_load_dwordx4 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_read_b64 +; SI-PROMOTE: ds_read_b64 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca <2 x i64>, i32 16, align 16 %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b store <2 x i64> %val, <2 x i64>* 
%ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load <2 x i64>, <2 x i64>* %ptr, align 16 store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 ret void diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll new file mode 100644 index 0000000000000..78868710c6a28 --- /dev/null +++ b/test/CodeGen/AMDGPU/inline-constraints.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: {{^}}inline_reg_constraints: +; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GCN: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] + +define void @inline_reg_constraints(i32 addrspace(1)* %ptr) { +entry: + %v32 = tail call i32 asm sideeffect "flat_load_dword $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %v128 = tail call <4 x i32> asm sideeffect "flat_load_dwordx4 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %s32 = tail call i32 asm sideeffect "s_load_dword $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s64 = tail call <2 x i32> asm sideeffect "s_load_dwordx2 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s128 = tail call <4 x i32> asm sideeffect "s_load_dwordx4 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s256 = tail call <8 x i32> asm sideeffect "s_load_dwordx8 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + ret void +} diff --git 
a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 6de3d408c4864..7f9579e59782b 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -70,8 +70,9 @@ define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x fl } ; SI-LABEL: {{^}}dynamic_insertelement_v8f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 +; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 @@ -79,10 +80,11 @@ define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x fl } ; SI-LABEL: {{^}}dynamic_insertelement_v16f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 +; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 @@ -202,10 +204,28 @@ endif: } ; SI-LABEL: {{^}}dynamic_insertelement_v2f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}} +; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} +; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}} + +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; 
SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} + +; SI: s_mov_b32 m0, [[SCALEDIDX]] +; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] + +; Increment to next element. +; FIXME: Should be able to manipulate m0 directly instead of add and +; copy. + +; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1 +; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 +; SI-DAG: s_mov_b32 m0, [[IDX1]] +; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] + +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b @@ -213,9 +233,16 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d ret void } +; FIXME: Inline immediate should be folded into v_movreld_b32. ; SI-LABEL: {{^}}dynamic_insertelement_v2i64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 + +; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}} +; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}} + +; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] +; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] + +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { %vecins = insertelement <2 x i64> %a, i64 5, i32 %b @@ -223,12 +250,29 @@ define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> ret void } +; FIXME: Should be able to do without stack access. The used stack +; space is also 2x what should be required. 
+ ; SI-LABEL: {{^}}dynamic_insertelement_v4f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: SCRATCH_RSRC_DWORD + +; Stack store +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} + +; Write element +; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; Stack reload +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; Store result +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm +; SI: ScratchSize: 64 + define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { %vecins = insertelement <4 x double> %a, double 8.0, i32 %b store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 @@ -236,15 +280,26 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d } ; SI-LABEL: {{^}}dynamic_insertelement_v8f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: SCRATCH_RSRC_DWORD + +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}} +; SI-DAG: buffer_store_dwordx4 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}} + +; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm +; SI: ScratchSize: 128 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { %vecins = insertelement <8 x double> %a, double 8.0, i32 %b store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 1dd7c2cb7995d..e9d98ac89e72d 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -4,8 +4,10 @@ ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; FUNC-LABEL: {{^}}i8_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ubyte +; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { entry: @@ -39,8 +41,10 @@ entry: } ; FUNC-LABEL: {{^}}i16_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ushort +; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z 
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { entry: @@ -290,8 +294,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 @@ -307,7 +311,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 @@ -409,8 +413,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 @@ -434,8 +438,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 
s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll new file mode 100644 index 0000000000000..8347b8c96ec4b --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s + +; FIXME: align on alloca seems to be ignored for private_segment_alignment + +; ALL-LABEL: {{^}}large_alloca_compute_shader: + +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + + +; GCNHSA: .amd_kernel_code_t + +; GCNHSA: compute_pgm_rsrc2_scratch_en = 1 +; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6 +; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1 +; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0 +; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0 +; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0 +; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 + +; GCNHSA: enable_sgpr_private_segment_buffer = 1 +; GCNHSA: enable_sgpr_dispatch_ptr = 0 +; GCNHSA: enable_sgpr_queue_ptr = 0 +; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1 +; GCNHSA: enable_sgpr_dispatch_id = 0 +; 
GCNHSA: enable_sgpr_flat_scratch_init = 0 +; GCNHSA: enable_sgpr_private_segment_size = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCNHSA: workitem_private_segment_byte_size = 32772 +; GCNHSA: private_segment_alignment = 4 +; GCNHSA: .end_amd_kernel_code_t + + +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen + +; Scratch size = alloca size + emergency stack slot +; ALL: ; ScratchSize: 32772 +define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll new file mode 100644 index 0000000000000..141ee2560152b --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s + +; ALL-LABEL: {{^}}large_alloca_pixel_shader: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = 
getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll deleted file mode 100644 index 671833d1a33a5..0000000000000 --- a/test/CodeGen/AMDGPU/large-alloca.ll +++ /dev/null @@ -1,15 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s - -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %0 = load i32, i32* %gep1 - store i32 %0, i32 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll index cff1c24f89d6e..9d2320cb2d19f 100644 
--- a/test/CodeGen/AMDGPU/literals.ll +++ b/test/CodeGen/AMDGPU/literals.ll @@ -7,8 +7,8 @@ ; ADD_INT literal.x KC0[2].Z, 5 ; CHECK: {{^}}i32_literal: -; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 5 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: @@ -24,8 +24,8 @@ entry: ; ADD literal.x KC0[2].Z, 5.0 ; CHECK: {{^}}float_literal: -; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 1084227584(5.0 define void @float_literal(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll index 8bf094b8bc7bf..ca8ddbae9fbc7 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll @@ -8,9 +8,7 @@ declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone ; FUNC-LABEL: {{^}}s_abs_i32: -; SI: s_sub_i32 -; SI: s_max_i32 -; SI: s_endpgm +; SI: s_abs_i32 ; EG: SUB_INT ; EG: MAX_INT diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll index 1168713ca66ee..d56b484572856 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll @@ -425,7 +425,7 @@ define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; SI: buffer_load_dword [[LOAD:v[0-9]+]] ; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 ; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] -; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] +; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] ; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; SI: buffer_store_dword [[TMP2]] define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 
addrspace(1)* %in) nounwind { diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll deleted file mode 100644 index 301de4b1c82d8..0000000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_brev_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll index 805a88b59c721..80eb3b93f8e50 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll @@ -271,7 +271,8 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 
-1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { @@ -285,7 +286,8 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll index f948c987b0385..7dc094ed1b4b7 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll @@ -4,7 +4,6 @@ ; FIXME: Enable for VI. 
declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll new file mode 100644 index 0000000000000..2e299e30b8c74 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}read_workdim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 
@llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll index 74792e50017f3..a30a8e083eb6f 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s ; R600: {{^}}amdgpu_trunc: -; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: {{^}}amdgpu_trunc: ; SI: v_trunc_f32 diff --git a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll new file mode 100644 index 0000000000000..0155757632d4f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}main: +; GCN: v_cvt_pkrtz_f16_f32 +; GCN: v_cvt_pkrtz_f16_f32 +; GCN-NOT: v_cvt_pkrtz_f16_f32 + +define void @main(float %src) #0 { +main_body: + %p1 = call i32 @llvm.SI.packf16(float undef, float %src) + %p2 = call i32 @llvm.SI.packf16(float %src, float undef) + %p3 = call i32 @llvm.SI.packf16(float undef, float undef) + %f1 = bitcast i32 %p1 to float + %f2 = bitcast i32 %p2 to float + %f3 = bitcast i32 %p3 to float + call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f1, float undef, float %f1) + call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f2, float undef, float %f2) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %f3, float undef, float %f2) + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" 
} +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll new file mode 100644 index 0000000000000..6d9db65e7d93a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.buffer.wbinvl1() #0 + +; GCN-LABEL: {{^}}test_buffer_wbinvl1: +; GCN-NEXT: ; BB#0: +; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00] +; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_buffer_wbinvl1() #0 { + call void @llvm.amdgcn.buffer.wbinvl1() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll new file mode 100644 index 0000000000000..746298465e580 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=SI %s + +declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0 + +; SI-LABEL: {{^}}test_buffer_wbinvl1_sc: +; SI-NEXT: ; BB#0: +; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] +; SI-NEXT: s_endpgm +define void @test_buffer_wbinvl1_sc() #0 { + call void @llvm.amdgcn.buffer.wbinvl1.sc() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll new file mode 100644 index 0000000000000..cecfcb1bfe7c0 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | 
FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0 + +; GCN-LABEL: {{^}}test_buffer_wbinvl1_vol: +; GCN-NEXT: ; BB#0: +; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] +; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_buffer_wbinvl1_vol() #0 { + call void @llvm.amdgcn.buffer.wbinvl1.vol() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll new file mode 100644 index 0000000000000..dc95cd1ee012f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test: +; GCN: enable_sgpr_dispatch_ptr = 1 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +define void @test(i32 addrspace(1)* %out) { + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %value = load i32, i32 addrspace(2)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll new file mode 100644 index 0000000000000..a28e1b1eb2413 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -0,0 +1,30 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +;GCN-LABEL: {{^}}v_interp: +;GCN-NOT: s_wqm +;GCN: s_mov_b32 m0, s{{[0-9]+}} 
+;GCN: v_interp_p1_f32 +;GCN: v_interp_p2_f32 +define void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +main_body: + %i = extractelement <2 x i32> %4, i32 0 + %j = extractelement <2 x i32> %4, i32 1 + %p0_0 = call float @llvm.amdgcn.interp.p1(i32 %i, i32 0, i32 0, i32 %3) + %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, i32 %j, i32 0, i32 0, i32 %3) + %p0_1 = call float @llvm.amdgcn.interp.p1(i32 %i, i32 1, i32 0, i32 %3) + %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, i32 %j, i32 1, i32 0, i32 %3) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %p1_1) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll new file mode 100644 index 0000000000000..02ee2039542ae --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -0,0 +1,24 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +;GCN-LABEL: {{^}}mbcnt_intrinsics: +;GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0 +;SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] +;VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] + +define void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 + %hi = 
call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1 + %4 = bitcast i32 %hi to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %4, float %4, float %4, float %4) + ret void +} + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 + +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll new file mode 100644 index 0000000000000..f8af67c17ec2b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.inv() #0 + +; GCN-LABEL: {{^}}test_s_dcache_inv: +; GCN-NEXT: ; BB#0: +; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7] +; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_s_dcache_inv() #0 { + call void @llvm.amdgcn.s.dcache.inv() + ret void +} + +; GCN-LABEL: {{^}}test_s_dcache_inv_insert_wait: +; GCN-NEXT: ; BB#0: +; GCN-NEXT: s_dcache_inv +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_inv_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.inv() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll new file mode 100644 index 0000000000000..a8502a7c5033b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: 
llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.inv.vol() #0 + +; GCN-LABEL: {{^}}test_s_dcache_inv_vol: +; GCN-NEXT: ; BB#0: +; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7] +; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_s_dcache_inv_vol() #0 { + call void @llvm.amdgcn.s.dcache.inv.vol() + ret void +} + +; GCN-LABEL: {{^}}test_s_dcache_inv_vol_insert_wait: +; GCN-NEXT: ; BB#0: +; GCN-NEXT: s_dcache_inv_vol +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_inv_vol_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.inv.vol() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll new file mode 100644 index 0000000000000..f9ae09b391aac --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.wb() #0 + +; VI-LABEL: {{^}}test_s_dcache_wb: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_endpgm +define void @test_s_dcache_wb() #0 { + call void @llvm.amdgcn.s.dcache.wb() + ret void +} + +; VI-LABEL: {{^}}test_s_dcache_wb_insert_wait: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb +; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_wb_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.wb() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll new file mode 100644 index 
0000000000000..d9145458a1f6c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.wb.vol() #0 + +; VI-LABEL: {{^}}test_s_dcache_wb_vol: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_endpgm +define void @test_s_dcache_wb_vol() #0 { + call void @llvm.amdgcn.s.dcache.wb.vol() + ret void +} + +; VI-LABEL: {{^}}test_s_dcache_wb_vol_insert_wait: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb_vol +; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_wb_vol_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.wb.vol() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll index a64dd0ebd2dd8..0c3e4ecaa1a0e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll @@ -4,7 +4,7 @@ declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone ; FUNC-LABEL: {{^}}test_lrp: -; SI: v_sub_f32 +; SI: v_mad_f32 ; SI: v_mac_f32_e32 define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll index d001bcb4db176..b01f8ab2bdf95 100644 --- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,11 +1,11 @@ -; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -mattr=-flat-for-global < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 
-; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- SGPR0_SGPR1 +; CHECK: s_load_dwordx2 s[4:5] +; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5 ; CHECK: buffer_store_dword ; CHECK: s_endpgm -define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 { +define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { entry: tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14 store i32 123, i32 addrspace(1)* %globalptr_arg, align 4 @@ -24,13 +24,13 @@ attributes #1 = { nounwind readnone } !1 = !DIFile(filename: "/tmp/test_debug_value.cl", directory: "/Users/matt/src/llvm/build_debug") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, function: void (i32 addrspace(1)*)* @test_debug_value, variables: !9) +!4 = distinct !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, variables: !9) !5 = !DISubroutineType(types: !6) !6 = !{null, !7} !7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 32) !8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !9 = !{!10} -!10 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "globalptr_arg", arg: 1, scope: !4, file: !1, line: 1, type: !7) +!10 = !DILocalVariable(name: "globalptr_arg", arg: 1, scope: !4, file: !1, line: 1, type: !7) !11 = !{i32 2, !"Dwarf Version", i32 4} !12 = !{i32 2, !"Debug Info Version", i32 3} !13 = !DIExpression() diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll index e491732cf9c5f..d83ab562b7180 100644 --- a/test/CodeGen/AMDGPU/llvm.memcpy.ll +++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -132,32 +132,15 @@ define void 
@test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias % } ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 ; SI: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { @@ -170,32 +153,15 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias % ; FIXME: Use 64-bit ops ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_read_b64 -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 ; SI-DAG: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll new file mode 100644 index 0000000000000..13ebee41e844e --- /dev/null +++ 
b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -0,0 +1,184 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[1].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 + +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[1].W + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_y(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].X + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z(i32 addrspace(1)* %out) { +entry: + %0 = call i32 
@llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xy: +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xy(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %val = mul i32 %x, %y + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xz: + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %x, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_yz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_yz(i32 addrspace(1)* %out) { +entry: + %y = 
call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %y, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xyz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xyz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %xy = mul i32 %x, %y + %xyz = add i32 %xy, %z + store i32 %xyz, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.x() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 
@llvm.r600.read.local.size.y() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.z() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll index 3d0f57e33280c..6b365dc09e2a9 100644 --- a/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -21,12 +21,9 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 { ; SI-DAG: v_cmp_eq_i32 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff -; SI-DAG: v_cmp_gt_i32_e64 +; SI-DAG: v_cmp_gt_i32_e32 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] -; SI-DAG: v_cmp_gt_i32_e64 - - ; SI: buffer_store_dwordx2 ; SI: s_endpgm define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll index 93b1b51a0d075..6a04261fe47bb 100644 --- a/test/CodeGen/AMDGPU/load.ll +++ b/test/CodeGen/AMDGPU/load.ll @@ -277,15 +277,9 @@ entry: ; FUNC-LABEL: {{^}}load_v8i32: ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. 
-; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword + +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { entry: %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -298,23 +292,11 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword + +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { entry: %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll index f501a7ac62748..6b52b80ba0822 100644 --- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -10,7 +10,7 @@ ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 -; GCN-NEXT: .long 38792 +; GCN-NEXT: .long 32900 ; EG: {{^}}local_memory_two_objects: @@ -30,7 +30,7 @@ ; constant offsets. 
; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] -; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}} +; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] ; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 ; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll index 9494ed75bd0c0..9ffb59e709200 100644 --- a/test/CodeGen/AMDGPU/local-memory.ll +++ b/test/CodeGen/AMDGPU/local-memory.ll @@ -9,9 +9,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 71560 +; SI-NEXT: .long 65668 ; CI: .long 47180 -; CI-NEXT: .long 38792 +; CI-NEXT: .long 32900 ; FUNC-LABEL: {{^}}local_memory: diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll index fef3e2f0a21ca..eeb915c10a960 100644 --- a/test/CodeGen/AMDGPU/max.ll +++ b/test/CodeGen/AMDGPU/max.ll @@ -2,7 +2,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone -; FUNC-LABEL: @v_test_imax_sge_i32 +; FUNC-LABEL: {{^}}v_test_imax_sge_i32: ; SI: v_max_i32_e32 define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -17,6 +17,24 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } +; FUNC-LABEL: {{^}}v_test_imax_sge_v4i32: +; SI: v_max_i32_e32 +; SI: v_max_i32_e32 +; SI: v_max_i32_e32 +; SI: v_max_i32_e32 +define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %out, i32 %tid + 
%a = load <4 x i32>, <4 x i32> addrspace(1)* %gep0, align 4 + %b = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 4 + %cmp = icmp sge <4 x i32> %a, %b + %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %val, <4 x i32> addrspace(1)* %outgep, align 4 + ret void +} + ; FUNC-LABEL: @s_test_imax_sge_i32 ; SI: s_max_i32 define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -35,6 +53,23 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ret void } +; FUNC-LABEL: {{^}}v_test_imax_sge_i8: +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: v_max_i32_e32 +define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %a = load i8, i8 addrspace(1)* %gep0, align 1 + %b = load i8, i8 addrspace(1)* %gep1, align 1 + %cmp = icmp sge i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %outgep, align 1 + ret void +} + ; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -44,6 +79,15 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ret void } +; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_v2i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { + %cmp = icmp sgt <2 x i32> %a, + %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + ret void +} ; FUNC-LABEL: @v_test_imax_sgt_i32 ; SI: v_max_i32_e32 define void 
@v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -92,6 +136,36 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_umax_uge_v3i32: +; SI: s_max_u32 +; SI: s_max_u32 +; SI: s_max_u32 +; SI-NOT: s_max_u32 +; SI: s_endpgm +define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { + %cmp = icmp uge <3 x i32> %a, %b + %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b + store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_test_umax_uge_i8: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_max_u32_e32 +define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %a = load i8, i8 addrspace(1)* %gep0, align 1 + %b = load i8, i8 addrspace(1)* %gep1, align 1 + %cmp = icmp uge i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %outgep, align 1 + ret void +} + ; FUNC-LABEL: @v_test_umax_ugt_i32 ; SI: v_max_u32_e32 define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -107,7 +181,7 @@ define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } -; FUNC-LABEL: @s_test_umax_ugt_i32 +; FUNC-LABEL: {{^}}s_test_umax_ugt_i32: ; SI: s_max_u32 define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp ugt i32 %a, %b @@ -116,13 +190,23 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32: +; SI: s_max_u32 
{{s[0-9]+}}, {{s[0-9]+}}, 15 +; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23 +define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { + %cmp = icmp ugt <2 x i32> %a, + %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] +; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] +; SI-NEXT: buffer_store_dword [[VMAX]] define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 @@ -135,13 +219,13 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1 ; Make sure redundant sign_extend_inreg removed. 
-; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] +; SI-NEXT: buffer_store_dword [[VMAX]] +define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp sgt i32 %a.ext, %b.ext @@ -152,15 +236,13 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 ret void } -; FIXME: Should get match min/max through extends inserted by -; legalization. 
- -; FUNC-LABEL: {{^}}s_test_imin_sge_i16: +; FUNC-LABEL: {{^}}s_test_imax_sge_i16: +; SI: s_load_dword +; SI: s_load_dword ; SI: s_sext_i32_i16 ; SI: s_sext_i32_i16 -; SI: v_cmp_ge_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +; SI: s_max_i32 +define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { %cmp = icmp sge i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll index 34a2fc7ffa745..65b454b5d8cbb 100644 --- a/test/CodeGen/AMDGPU/merge-stores.ll +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -1,5 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s + +; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s ; Run with devices with different unaligned load restrictions. 
@@ -65,10 +68,8 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* } ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: -; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 @@ -89,10 +90,8 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: -; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0 -; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}} -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]] +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0 +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 @@ -121,10 +120,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dwordx2 v +; GCN: buffer_store_dwordx4 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 @@ -137,17 +133,9 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) ret void } -; First store is out of order. 
Because of order of combines, the -; consecutive store fails because only some of the stores have been -; replaced with integer constant stores, and then won't merge because -; the types are different. - +; First store is out of order. ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx4 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 @@ -160,6 +148,33 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { ret void } +; FIXME: Should be able to merge this +; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32: +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v + +; GCN-AA: buffer_store_dwordx2 +; GCN-AA: buffer_store_dword v +; GCN-AA: buffer_store_dword v + +; GCN: s_endpgm +define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* + %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)* + + store i32 11, i32 addrspace(1)* %out.gep.1.bc + store float 2.0, float addrspace(1)* %out.gep.2 + store i32 17, i32 addrspace(1)* %out.gep.3.bc + store float 8.0, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: ; SI-DAG: buffer_store_dwordx2 ; SI-DAG: buffer_store_dword @@ -176,9 +191,7 @@ define void @merge_global_store_3_constants_i32(i32 
addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx4 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 @@ -188,13 +201,8 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 @@ -472,11 +480,15 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1 ; This works once AA is enabled on the subtarget ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XGCN: buffer_store_dwordx4 [[LOAD]] -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v + +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v + +; GCN-AA: buffer_store_dwordx4 [[LOAD]] + +; GCN: s_endpgm define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -508,10 +520,8 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { } ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: -; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; 
GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 @@ -522,10 +532,15 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { } ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 +; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8 +; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3 + +; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2 +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1 + +; GCN: s_endpgm define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 @@ -597,17 +612,9 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { } ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: s_endpgm define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 @@ -627,7 +634,78 @@ define void 
@merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { ret void } +; This requires handling of scalar_to_vector for v2i64 to avoid +; scratch usage. +; FIXME: Should do single load and store + +; GCN-LABEL: {{^}}copy_v3i32_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 + +; GCN: ScratchSize: 0{{$}} +define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { + %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 + store <3 x i32> %vec, <3 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}copy_v3i64_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN: ScratchSize: 0{{$}} +define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { + %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 + store <3 x i64> %vec, <3 x i64> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}copy_v3f32_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen 
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN: ScratchSize: 0{{$}} +define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { + %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 + %fadd = fadd <3 x float> %vec, + store <3 x float> %fadd, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}copy_v3f64_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN: ScratchSize: 0{{$}} +define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { + %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 + %fadd = fadd <3 x double> %vec, + store <3 x double> %fadd, <3 x double> addrspace(1)* %out + ret void +} + declare void @llvm.AMDGPU.barrier.local() #1 attributes #0 = { nounwind } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index 0332d1a8e407c..215dbeb4b2fdd 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -2,7 +2,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone -; FUNC-LABEL: @v_test_imin_sle_i32 +; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; SI: v_min_i32_e32 define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -17,7 
+17,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } -; FUNC-LABEL: @s_test_imin_sle_i32 +; FUNC-LABEL: {{^}}s_test_imin_sle_i32: ; SI: s_min_i32 define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sle i32 %a, %b @@ -26,6 +26,78 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32: +; SI: s_min_i32 +define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %cmp = icmp sle <1 x i32> %a, %b + %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b + store <1 x i32> %val, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32: +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { + %cmp = icmp sle <4 x i32> %a, %b + %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_i8: +; SI: s_load_dword +; SI: s_load_dword +; SI: s_sext_i32_i8 +; SI: s_sext_i32_i8 +; SI: s_min_i32 +define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { + %cmp = icmp sle i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %out + ret void +} + +; XXX - should be able to use s_min if we stop unnecessarily doing +; extloads with mubuf instructions. 
+ +; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte + +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 + +; SI: s_endpgm +define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind { + %cmp = icmp sle <4 x i8> %a, %b + %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b + store <4 x i8> %val, <4 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 +define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { + %cmp = icmp sle <4 x i16> %a, %b + %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b + store <4 x i16> %val, <4 x i16> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: @v_test_imin_slt_i32 ; SI: v_min_i32_e32 define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -50,6 +122,16 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32: +; SI: s_min_i32 +; SI: s_min_i32 +define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %cmp = icmp slt <2 x i32> %a, %b + %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b + store <2 x i32> %val, <2 x i32> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: ; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -83,6 +165,24 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } +; FUNC-LABEL: @v_test_umin_ule_v3i32 +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 +; SI-NOT: 
v_min_u32_e32 +; SI: s_endpgm +define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid + %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep0 + %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep1 + %cmp = icmp ule <3 x i32> %a, %b + %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b + store <3 x i32> %val, <3 x i32> addrspace(1)* %outgep + ret void +} ; FUNC-LABEL: @s_test_umin_ule_i32 ; SI: s_min_u32 define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -107,6 +207,23 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } +; FUNC-LABEL: {{^}}v_test_umin_ult_i8: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_min_u32_e32 +define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %a = load i8, i8 addrspace(1)* %gep0, align 1 + %b = load i8, i8 addrspace(1)* %gep1, align 1 + %cmp = icmp ult i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %outgep, align 1 + ret void +} + ; FUNC-LABEL: @s_test_umin_ult_i32 ; SI: s_min_u32 define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -137,6 +254,48 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace ret void } + +; FUNC-LABEL: @s_test_umin_ult_v1i32 +; SI: s_min_u32 +define void 
@s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %cmp = icmp ult <1 x i32> %a, %b + %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b + store <1 x i32> %val, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32: +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { + %cmp = icmp ult <8 x i32> %a, %b + %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b + store <8 x i32> %val, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { + %cmp = icmp ult <8 x i16> %a, %b + %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + store <8 x i16> %val, <8 x i16> addrspace(1)* %out + ret void +} + ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb @@ -173,14 +332,8 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 ret void } -; FIXME: Should get match min/max through extends inserted by -; legalization. 
- ; FUNC-LABEL: {{^}}s_test_imin_sle_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_le_i32_e32 -; SI: v_cndmask_b32 +; SI: s_min_i32 define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { %cmp = icmp sle i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll new file mode 100644 index 0000000000000..e9f641b736d56 --- /dev/null +++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s + +; Check that when mubuf addr64 instruction is handled in moveToVALU +; from the pointer, dead register writes are not emitted. + +; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 + +; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: +; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} + +; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] +; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN-NOT: v_mov_b32 + +; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] +; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] +; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, + +define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { +bb: + %tmp = icmp sgt i32 %arg3, 0 + br i1 %tmp, label %bb4, label %bb17 + +bb4: + %tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg + %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1 + %tmp16 = load volatile i8, i8 addrspace(1)* %tmp15 + br label %bb17 + +bb17: + ret void +} + +attributes #0 
= { nounwind } diff --git a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll new file mode 100644 index 0000000000000..8bca0575ecd23 --- /dev/null +++ b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -0,0 +1,52 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; FIXME: broken on VI because flat instructions need to be emitted +; instead of addr64 equivalent of the _OFFSET variants. + +; Check that moving the pointer out of the resource descriptor to +; vaddr works for atomics. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; GCN-LABEL: {{^}}atomic_max_i32: +; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}} +define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid + %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep + %xor = xor i32 %tid, 1 + %cmp = icmp ne i32 %xor, 0 + br i1 %cmp, label %atomic, label %exit + +atomic: + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100 + %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst + store i32 %ret, i32 addrspace(1)* %out + br label %exit + +exit: + ret void +} + +; GCN-LABEL: {{^}}atomic_max_i32_noret: +; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}} +define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid + %ptr = 
load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep + %xor = xor i32 %tid, 1 + %cmp = icmp ne i32 %xor, 0 + br i1 %cmp, label %atomic, label %exit + +atomic: + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100 + %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst + br label %exit + +exit: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll new file mode 100644 index 0000000000000..73a146710a9ff --- /dev/null +++ b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll @@ -0,0 +1,18 @@ +; RUN: not llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported non-compute shaders with HSA in pixel_shader +define void @pixel_shader() #0 { + ret void +} + +define void @vertex_shader() #1 { + ret void +} + +define void @geometry_shader() #2 { + ret void +} + +attributes #0 = { nounwind "ShaderType"="0" } +attributes #1 = { nounwind "ShaderType"="1" } +attributes #2 = { nounwind "ShaderType"="2" } diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll index e4328ecbaca8d..f81911aafe220 100644 --- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -189,3 +189,15 @@ define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace store i8 %trunc, i8 addrspace(1)* %gep.out ret void } + +; FUNC-LABEL: {{^}}smrd_mask_i32_to_i16 +; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0 +; SI: s_waitcnt lgkmcnt(0) +; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff +define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +entry: + %val = load i32, i32 addrspace(2)* %in + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/opencl-image-metadata.ll 
b/test/CodeGen/AMDGPU/opencl-image-metadata.ll new file mode 100644 index 0000000000000..bc467e47dc316 --- /dev/null +++ b/test/CodeGen/AMDGPU/opencl-image-metadata.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; Make sure the OpenCL Image lowering pass doesn't crash when argument metadata +; is not in expected order. + +; EG: CF_END +; SI: s_endpgm +define void @kernel(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} + +attributes #3 = { nounwind } + +!opencl.kernels = !{!0} + +!0 = !{void (i32 addrspace(1)*)* @kernel, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 0} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_type_qual", !""} +!5 = !{!"kernel_arg_name", !""} diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll index 816755efb07ce..9e514ef9970ac 100644 --- a/test/CodeGen/AMDGPU/operand-folding.ll +++ b/test/CodeGen/AMDGPU/operand-folding.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s ; CHECK-LABEL: {{^}}fold_sgpr: -; CHECK: v_add_i32_e32 v{{[0-9]+}}, s +; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { entry: %tmp0 = icmp ne i32 %fold, 0 diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll index 1c04090b407ff..e40f18f040b7a 100644 --- a/test/CodeGen/AMDGPU/or.ll +++ b/test/CodeGen/AMDGPU/or.ll @@ -153,7 +153,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { } ; FUNC-LABEL: {{^}}or_i1: -; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] define void @or_i1(i32 
addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { diff --git a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll new file mode 100644 index 0000000000000..51985af42a290 --- /dev/null +++ b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -verify-coalescing < %s + +; The original and requires materializing a 64-bit immediate for +; s_and_b64. This is split into 2 x v_and_i32, part of the immediate +; is folded through the reg_sequence into the v_and_i32 operand, and +; only half of the result is ever used. +; +; During live interval construction, the first sub register def is +; incorrectly marked as dead. + +declare i32 @llvm.r600.read.tidig.x() #1 + +define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep + + %lshr = shl i64 %val, 24 + %and1 = and i64 %lshr, 2190433320969 ; (255 << 33) | 9 + %vec = bitcast i64 %and1 to <2 x i32> + %elt1 = extractelement <2 x i32> %vec, i32 1 + + store i32 %elt1, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll index 645dc04f44202..79778eebd802b 100644 --- a/test/CodeGen/AMDGPU/private-memory.ll +++ b/test/CodeGen/AMDGPU/private-memory.ll @@ -1,6 +1,8 @@ ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs 
-mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC @@ -13,11 +15,21 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; R600: LDS_READ ; R600: LDS_READ +; HSA-PROMOTE: .amd_kernel_code_t +; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120 +; HSA-PROMOTE: .end_amd_kernel_code_t + ; SI-PROMOTE: ds_write_b32 ; SI-PROMOTE: ds_write_b32 ; SI-PROMOTE: ds_read_b32 ; SI-PROMOTE: ds_read_b32 +; HSA-ALLOCA: .amd_kernel_code_t +; FIXME: Creating the emergency stack slots causes us to over-estimate scratch +; by 4 bytes. 
+; HSA-ALLOCA: workitem_private_segment_byte_size = 24 +; HSA-ALLOCA: .end_amd_kernel_code_t + ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll index de6bfb3108836..4bb315049be4b 100644 --- a/test/CodeGen/AMDGPU/register-count-comments.ll +++ b/test/CodeGen/AMDGPU/register-count-comments.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.SI.tid() nounwind readnone diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll index 187650ff9a537..d5e10d0be883d 100644 --- a/test/CodeGen/AMDGPU/reorder-stores.ll +++ b/test/CodeGen/AMDGPU/reorder-stores.ll @@ -2,14 +2,10 @@ ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <2 
x double>, <2 x double> addrspace(1)* %x, align 16 @@ -34,46 +30,16 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace } ; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll index 6b1a36c979c2a..47c7fbb6dd6a8 100644 --- a/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -3,10 +3,9 @@ ; SI-LABEL: {{^}}s_movk_i32_k0: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: 
v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -17,10 +16,9 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k1: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -31,10 +29,9 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k2: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -45,10 +42,9 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k3: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 
{{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -59,10 +55,9 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k4: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -87,10 +82,9 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k6: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll index 0b9649576545d..a30c25e700aba 100644 --- a/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; 
RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tidig.y() #0 ; In this test both the pointer and the offset operands to the ; BUFFER_LOAD instructions end up being stored in vgprs. This @@ -7,94 +11,267 @@ ; sgpr register pair and use that for the pointer operand ; (low 64-bits of srsrc). -; CHECK-LABEL: {{^}}mubuf: +; GCN-LABEL: {{^}}mubuf: ; Make sure we aren't using VGPRs for the source operand of s_mov_b64 -; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v +; GCN-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v ; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_* ; instructions -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 + +define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { entry: - %0 = call i32 @llvm.r600.read.tidig.x() #1 - %1 = call i32 @llvm.r600.read.tidig.y() #1 - %2 = sext i32 %0 to i64 - %3 = sext i32 %1 to i64 + %tmp = call i32 @llvm.r600.read.tidig.x() + %tmp1 = call i32 @llvm.r600.read.tidig.y() + %tmp2 = sext i32 %tmp to i64 + %tmp3 = sext i32 %tmp1 to i64 br label %loop -loop: - %4 = phi i64 [0, %entry], [%5, %loop] - %5 = add i64 %2, %4 - %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5 - %7 = load i8, i8 addrspace(1)* %6, align 1 - %8 = or i64 %5, 1 - %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8 - %10 = load i8, i8 addrspace(1)* %9, align 1 - %11 = add i8 %7, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - %13 = icmp slt i64 %5, 10 - br i1 %13, label %loop, 
label %done - -done: +loop: ; preds = %loop, %entry + %tmp4 = phi i64 [ 0, %entry ], [ %tmp5, %loop ] + %tmp5 = add i64 %tmp2, %tmp4 + %tmp6 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp5 + %tmp7 = load i8, i8 addrspace(1)* %tmp6, align 1 + %tmp8 = or i64 %tmp5, 1 + %tmp9 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp8 + %tmp10 = load i8, i8 addrspace(1)* %tmp9, align 1 + %tmp11 = add i8 %tmp7, %tmp10 + %tmp12 = sext i8 %tmp11 to i32 + store i32 %tmp12, i32 addrspace(1)* %out + %tmp13 = icmp slt i64 %tmp5, 10 + br i1 %tmp13, label %loop, label %done + +done: ; preds = %loop ret void } -declare i32 @llvm.r600.read.tidig.x() #1 -declare i32 @llvm.r600.read.tidig.y() #1 - -attributes #1 = { nounwind readnone } - ; Test moving an SMRD instruction to the VALU -; CHECK-LABEL: {{^}}smrd_valu: -; CHECK: buffer_load_dword [[OUT:v[0-9]+]] -; CHECK: buffer_store_dword [[OUT]] - -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) { +; GCN-LABEL: {{^}}smrd_valu: +; GCN: buffer_load_dword [[OUT:v[0-9]+]] +; GCN: buffer_store_dword [[OUT]] +define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { entry: - %0 = icmp ne i32 %a, 0 - br i1 %0, label %if, label %else + %tmp = icmp ne i32 %a, 0 + br i1 %tmp, label %if, label %else -if: - %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in +if: ; preds = %entry + %tmp1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in br label %endif -else: - %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2 +else: ; preds = %entry + %tmp2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in + %tmp3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %tmp2 br label %endif -endif: - %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else] - %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000 - %6 = load i32, i32 addrspace(2)* 
%5 - store i32 %6, i32 addrspace(1)* %out +endif: ; preds = %else, %if + %tmp4 = phi i32 addrspace(2)* [ %tmp1, %if ], [ %tmp3, %else ] + %tmp5 = getelementptr i32, i32 addrspace(2)* %tmp4, i32 3000 + %tmp6 = load i32, i32 addrspace(2)* %tmp5 + store i32 %tmp6, i32 addrspace(1)* %out ret void } -; Test moving ann SMRD with an immediate offset to the VALU +; Test moving an SMRD with an immediate offset to the VALU -; CHECK-LABEL: {{^}}smrd_valu2: -; CHECK: buffer_load_dword -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) { +; GCN-LABEL: {{^}}smrd_valu2: +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} +define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { entry: - %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %1 = add i32 %0, 4 - %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4 - %3 = load i32, i32 addrspace(2)* %2 - store i32 %3, i32 addrspace(1)* %out + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out ret void } -; CHECK-LABEL: {{^}}s_load_imm_v8i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +; Use a big offset that will use the SMRD literal offset on CI +; GCN-LABEL: {{^}}smrd_valu_ci_offset: +; GCN-NOT: v_add +; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN: v_add_i32_e32 +; GCN: buffer_store_dword +define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + 
%tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr i32, i32 addrspace(2)* %tmp2, i32 5000 + %tmp4 = load i32, i32 addrspace(2)* %tmp3 + %tmp5 = add i32 %tmp4, %c + store i32 %tmp5, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2: +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx2 +define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr i64, i64 addrspace(2)* %tmp2, i32 5000 + %tmp4 = load i64, i64 addrspace(2)* %tmp3 + %tmp5 = or i64 %tmp4, %c + store i64 %tmp5, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4: +; GCN-NOT: v_add +; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx4 +define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %tmp2, i32 1234 + %tmp4 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp3 + %tmp5 
= or <4 x i32> %tmp4, %c + store <4 x i32> %tmp5, <4 x i32> addrspace(1)* %out + ret void +} + +; Original scalar load uses SGPR offset on SI and 32-bit literal on +; CI. + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} + +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %tmp2, i32 1234 + %tmp4 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp3 + %tmp5 = or <8 x i32> %tmp4, %c + store <8 x i32> %tmp5, <8 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: + +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} +; GCN-NOT: v_add +; GCN: 
buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}} + +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 + +; GCN: s_endpgm +define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %tmp2, i32 1234 + %tmp4 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp3 + %tmp5 = or <16 x i32> %tmp4, %c + store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu2_salu_user: +; GCN: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]] +; GCN: buffer_store_dword [[ADD]] +define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { 
+entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + %tmp4 = add i32 %tmp3, %a + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset: +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} +define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 255 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset: +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} +define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 256 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_load_imm_v8i32: +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 @@ -102,12 +279,51 @@ entry: ret void } -; CHECK-LABEL: {{^}}s_load_imm_v16i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; 
CHECK: buffer_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user: +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: buffer_store_dword +define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() + %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* + %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 + + %elt0 = extractelement <8 x i32> %tmp3, i32 0 + %elt1 = extractelement <8 x i32> %tmp3, i32 1 + %elt2 = extractelement <8 x i32> %tmp3, i32 2 + %elt3 = extractelement <8 x i32> %tmp3, i32 3 + %elt4 = extractelement <8 x i32> %tmp3, i32 4 + %elt5 = extractelement <8 x i32> %tmp3, i32 5 + %elt6 = extractelement <8 x i32> %tmp3, i32 6 + %elt7 = extractelement <8 x i32> %tmp3, i32 7 + + %add0 = add i32 %elt0, %elt1 + %add1 = add i32 %add0, %elt2 + %add2 = add i32 %add1, %elt3 + %add3 = add i32 %add2, %elt4 + %add4 = add i32 %add3, %elt5 + %add5 = add i32 %add4, %elt6 + %add6 = add i32 %add5, %elt7 + + store i32 %add6, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_load_imm_v16i32: +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -116,3 +332,71 @@ entry: store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 ret void } + +; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user: 
+; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: buffer_store_dword +define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* + %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 + + %elt0 = extractelement <16 x i32> %tmp3, i32 0 + %elt1 = extractelement <16 x i32> %tmp3, i32 1 + %elt2 = extractelement <16 x i32> %tmp3, i32 2 + %elt3 = extractelement <16 x i32> %tmp3, i32 3 + %elt4 = extractelement <16 x i32> %tmp3, i32 4 + %elt5 = extractelement <16 x i32> %tmp3, i32 5 + %elt6 = extractelement <16 x i32> %tmp3, i32 6 + %elt7 = extractelement <16 x i32> %tmp3, i32 7 + %elt8 = extractelement <16 x i32> %tmp3, i32 8 + %elt9 = extractelement <16 x i32> %tmp3, i32 9 + %elt10 = extractelement <16 x i32> %tmp3, i32 10 + %elt11 = extractelement <16 x i32> %tmp3, i32 11 + %elt12 = extractelement <16 x i32> %tmp3, i32 12 + %elt13 = extractelement <16 x i32> %tmp3, i32 13 + %elt14 = extractelement <16 x i32> %tmp3, i32 14 + %elt15 = extractelement <16 x i32> %tmp3, i32 15 + + %add0 = add i32 %elt0, %elt1 + %add1 = add i32 %add0, %elt2 + %add2 = add i32 %add1, %elt3 + %add3 = add i32 %add2, %elt4 + %add4 = add i32 %add3, %elt5 + %add5 = add i32 %add4, %elt6 + %add6 = add i32 %add5, %elt7 + %add7 = add i32 %add6, %elt8 + %add8 = add i32 %add7, %elt9 + %add9 = add i32 %add8, %elt10 + %add10 = add i32 %add9, %elt11 + 
%add11 = add i32 %add10, %elt12 + %add12 = add i32 %add11, %elt13 + %add13 = add i32 %add12, %elt14 + %add14 = add i32 %add13, %elt15 + + store i32 %add14, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/sampler-resource-id.ll b/test/CodeGen/AMDGPU/sampler-resource-id.ll new file mode 100644 index 0000000000000..c41d345369bf6 --- /dev/null +++ b/test/CodeGen/AMDGPU/sampler-resource-id.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=r600 -mcpu=juniper < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_0(i32 %in0, i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +declare i32 @llvm.OpenCL.sampler.get.resource.id(i32) #0 + +attributes #0 = { readnone } + +!opencl.kernels = !{!0, !1, !2} + +!0 = !{void (i32, i32 addrspace(1)*)* @test_0, !10, !20, !30, !40, !50} +!10 = !{!"kernel_arg_addr_space", i32 0, i32 1} +!20 = !{!"kernel_arg_access_qual", !"none", !"none"} +!30 = !{!"kernel_arg_type", !"sampler_t", !"int*"} +!40 = 
!{!"kernel_arg_base_type", !"sampler_t", !"int*"} +!50 = !{!"kernel_arg_type_qual", !"", !""} + +!1 = !{void (i32, i32, i32 addrspace(1)*)* @test_1, !11, !21, !31, !41, !51} +!11 = !{!"kernel_arg_addr_space", i32 0, i32 0, i32 1} +!21 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!31 = !{!"kernel_arg_type", !"sampler_t", !"sampler_t", !"int*"} +!41 = !{!"kernel_arg_base_type", !"sampler_t", !"sampler_t", !"int*"} +!51 = !{!"kernel_arg_type_qual", !"", !"", !""} + +!2 = !{void (i32, i32, i32, i32 addrspace(1)*)* @test_2, !12, !22, !32, !42, !52} +!12 = !{!"kernel_arg_addr_space", i32 0, i32 0, i32 0, i32 1} +!22 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!32 = !{!"kernel_arg_type", !"sampler_t", !"sampler_t", !"sampler_t", !"int*"} +!42 = !{!"kernel_arg_base_type", !"sampler_t", !"sampler_t", !"sampler_t", !"int*"} +!52 = !{!"kernel_arg_type_qual", !"", !"", !"", !""} diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll index 3863afda5dd3a..e4b16c0a165f4 100644 --- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll +++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -3,7 +3,7 @@ ; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate +declare void @llvm.AMDGPU.barrier.local() nounwind convergent ; SI-LABEL: {{^}}main( diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index 268869daaa321..d43de47660576 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -51,7 +51,7 @@ done: ; GCN-LABEL: {{^}}legal_offset_fi_offset ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; GCN: v_add_i32_e32 
[[OFFSET:v[0-9]+]], 0x8000 +; GCN: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8000 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll index 13fb575b2b151..a68fdecb00af7 100644 --- a/test/CodeGen/AMDGPU/select64.ll +++ b/test/CodeGen/AMDGPU/select64.ll @@ -51,12 +51,8 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa } ; CHECK-LABEL: {{^}}v_select_i64_split_imm: -; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 -; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 -; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] -; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] -; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} -; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}} ; CHECK: s_endpgm define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll index 53694dcffa66e..57365a6e1fc37 100644 --- a/test/CodeGen/AMDGPU/set-dx10.ll +++ b/test/CodeGen/AMDGPU/set-dx10.ll @@ -5,8 +5,8 @@ ; SET*DX10 instructions. 
; CHECK: {{^}}fcmp_une_select_fptosi: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -19,8 +19,8 @@ entry: } ; CHECK: {{^}}fcmp_une_select_i32: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -31,8 +31,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_fptosi: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -45,8 +45,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_i32: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -57,8 +57,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -71,8 +71,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: 
SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -83,8 +83,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -97,8 +97,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -109,8 +109,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -123,8 +123,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -135,8 +135,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_fptosi(i32 addrspace(1)* 
%out, float %in) { entry: @@ -149,8 +149,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll index 4e6a10d6b78d7..63d74820f9613 100644 --- a/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/test/CodeGen/AMDGPU/setcc-opt.ll @@ -142,11 +142,14 @@ define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind } ; FUNC-LABEL: {{^}}cmp_zext_k_i8max: -; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] +; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff +; GCN: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] +; GCN: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] +; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK255]] ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 @@ -187,11 +190,14 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n ; Should do a buffer_load_sbyte and compare with -1 ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: -; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff +; 
GCN: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] +; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK]]{{$}} ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = sext i8 %b to i32 diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index 5aedda2ce1a9c..23ae3b967971d 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -12,8 +12,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI: buffer_store_dword [[EXTRACT]], ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 -; EG-NEXT: LSHR * [[ADDR]] +; EG: LSHR * [[ADDR]] +; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1 define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { %shl = shl i32 %in, 31 %sext = ashr i32 %shl, 31 @@ -609,3 +609,53 @@ define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1 store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } + +; Make sure we propagate the VALUness to users of a moved scalar BFE. 
+ +; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 +; SI-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 63 + %ashr = ashr i64 %shl, 63 + + %and = and i64 %ashr, %s.val + store i64 %and, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64_move_use: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; SI-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 
addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 32 + %ashr = ashr i64 %shl, 32 + %and = and i64 %ashr, %s.val + store i64 %and, i64 addrspace(1)* %out.gep, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index 6f81a39ed96aa..55db80731c900 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -53,14 +53,14 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ret void } -;EG: {{^}}shl_i64: +;EG-LABEL: {{^}}shl_i64: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}} ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal ;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} ;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 @@ -80,7 +80,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ret void } -;EG: {{^}}shl_v2i64: +;EG-LABEL: {{^}}shl_v2i64: ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] ;EG-DAG: SUB_INT {{\*? 
*}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] ;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] @@ -185,8 +185,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in ; Make sure load width gets reduced to i32 load. ; GCN-LABEL: {{^}}s_shl_32_i64: ; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} -; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll index b1485bfaaebb0..dfb2bf3383fc5 100644 --- a/test/CodeGen/AMDGPU/shl_add_constant.ll +++ b/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -6,7 +6,7 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; FUNC-LABEL: {{^}}shl_2_add_9_i32: ; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { @@ -20,7 +20,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { } ; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: -; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} +; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], vcc, 9, {{v[0-9]+}} ; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} ; SI-DAG: buffer_store_dword [[ADDREG]] ; SI-DAG: buffer_store_dword [[SHLREG]] @@ -40,7 +40,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1 ; FUNC-LABEL: {{^}}shl_2_add_999_i32: ; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] +; SI: v_add_i32_e32 
[[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll index 6671e909cd1dd..ac94824bd61f1 100644 --- a/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -35,7 +35,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad ; SI-LABEL: {{^}}load_shl_base_lds_1: ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 -; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} +; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}} ; SI-DAG: buffer_store_dword [[RESULT]] ; SI-DAG: buffer_store_dword [[ADDUSE]] ; SI: s_endpgm diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll new file mode 100644 index 0000000000000..27a8e70aae137 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -0,0 +1,16 @@ +; RUN: llc -o /dev/null %s -march=amdgcn -mcpu=verde -verify-machineinstrs -stop-after expand-isel-pseudos 2>&1 | FileCheck %s +; This test verifies that the instruction selection will add the implicit +; register operands in the correct order when modifying the opcode of an +; instruction to V_ADD_I32_e32. 
+ +; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/si-literal-folding.ll b/test/CodeGen/AMDGPU/si-literal-folding.ll new file mode 100644 index 0000000000000..901b3c3453fc3 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-literal-folding.ll @@ -0,0 +1,17 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8 + +define void @main(float) #0 { +main_body: + %1 = fmul float %0, 0x3FE86A7F00000000 + %2 = fmul float %0, 0xBFE86A7F00000000 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %1, float %1, float %2, float %2) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 84652701f7731..d7b35fc631ebb 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -6,6 +6,16 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: s_wqm + +; Make sure not emitting unused scratch resource descriptor setup +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 + +; CHECK: s_mov_b32 m0 + + ; Writing to M0 from an SMRD instruction will hang the GPU. 
; CHECK-NOT: s_buffer_load_dword m0 ; CHECK: s_endpgm diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 5a6129aaa3fa0..bc766dbcac676 100644 --- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -155,9 +155,9 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, } ; FUNC-LABEL: @reorder_local_offsets -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 ; CI: buffer_store_dword @@ -181,9 +181,10 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa } ; FUNC-LABEL: @reorder_global_offsets -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 ; CI: buffer_store_dword @@ -233,4 +234,4 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" 
"no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #2 = { nounwind noduplicate } +attributes #2 = { nounwind convergent } diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 0db7cdc171b54..a94ccc32e61c6 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -46,9 +46,9 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { ; SI-LABEL: @v_sint_to_fp_i64_to_f64 ; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll new file mode 100644 index 0000000000000..e646605f7da1d --- /dev/null +++ b/test/CodeGen/AMDGPU/sminmax.ll @@ -0,0 +1,130 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}s_abs_i32: +; GCN: s_abs_i32 +; GCN: s_add_i32 +define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { + %neg = sub i32 0, %val + %cond = icmp sgt i32 %val, %neg + %res = select i1 %cond, i32 %val, i32 %neg + %res2 = add i32 %res, 2 + store i32 %res2, i32 
addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_i32: +; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]] +; GCN: v_add_i32 +define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %neg = sub i32 0, %val + %cond = icmp sgt i32 %val, %neg + %res = select i1 %cond, i32 %val, i32 %neg + %res2 = add i32 %res, 2 + store i32 %res2, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_abs_v2i32: +; GCN: s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { + %z0 = insertelement <2 x i32> undef, i32 0, i32 0 + %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 + %t0 = insertelement <2 x i32> undef, i32 2, i32 0 + %t1 = insertelement <2 x i32> %t0, i32 2, i32 1 + %neg = sub <2 x i32> %z1, %val + %cond = icmp sgt <2 x i32> %val, %neg + %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg + %res2 = add <2 x i32> %res, %t1 + store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_v2i32: +; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] + +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] + +; GCN: v_add_i32 +; GCN: v_add_i32 +define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { + %z0 = insertelement <2 x i32> undef, i32 0, i32 0 + %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 + %t0 = insertelement <2 x i32> undef, i32 2, i32 0 + %t1 = insertelement <2 x i32> %t0, i32 2, i32 1 + %val = load <2 x i32>, <2 x i32> addrspace(1)* %src, align 4 + %neg = sub <2 x i32> %z1, %val + %cond = icmp sgt <2 x i32> %val, %neg + %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg + %res2 
= add <2 x i32> %res, %t1 + store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_abs_v4i32: +; TODO: this should use s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_abs_i32 + +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { + %z0 = insertelement <4 x i32> undef, i32 0, i32 0 + %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 + %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 + %z3 = insertelement <4 x i32> %z2, i32 0, i32 3 + %t0 = insertelement <4 x i32> undef, i32 2, i32 0 + %t1 = insertelement <4 x i32> %t0, i32 2, i32 1 + %t2 = insertelement <4 x i32> %t1, i32 2, i32 2 + %t3 = insertelement <4 x i32> %t2, i32 2, i32 3 + %neg = sub <4 x i32> %z3, %val + %cond = icmp sgt <4 x i32> %val, %neg + %res = select <4 x i1> %cond, <4 x i32> %val, <4 x i32> %neg + %res2 = add <4 x i32> %res, %t3 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_v4i32: +; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]] + +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]] + +; GCN: v_add_i32 +; GCN: v_add_i32 +; GCN: v_add_i32 +; GCN: v_add_i32 +define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { + %z0 = insertelement <4 x i32> undef, i32 0, i32 0 + %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 + %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 + %z3 = insertelement <4 x i32> %z2, i32 0, i32 3 + %t0 = insertelement <4 x i32> undef, i32 2, i32 0 + %t1 = 
insertelement <4 x i32> %t0, i32 2, i32 1 + %t2 = insertelement <4 x i32> %t1, i32 2, i32 2 + %t3 = insertelement <4 x i32> %t2, i32 2, i32 3 + %val = load <4 x i32>, <4 x i32> addrspace(1)* %src, align 4 + %neg = sub <4 x i32> %z3, %val + %cond = icmp sgt <4 x i32> %val, %neg + %res = select <4 x i1> %cond, <4 x i32> %val, <4 x i32> %neg + %res2 = add <4 x i32> %res, %t3 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 0598208e13173..1d6bb9ece8c68 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -1,9 +1,10 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s ; SMRD load with an immediate offset. ; GCN-LABEL: {{^}}smrd0: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: @@ -15,7 +16,7 @@ entry: ; SMRD load with the largest possible immediate offset. 
; GCN-LABEL: {{^}}smrd1: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: @@ -29,6 +30,7 @@ entry: ; GCN-LABEL: {{^}}smrd2: ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { @@ -54,9 +56,37 @@ entry: ret void } +; SMRD load with the largest possible immediate offset on VI +; GCN-LABEL: {{^}}smrd4: +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate on VI +; GCN-LABEL: {{^}}smrd5: +; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 +; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 +; GCN: s_endpgm +define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + ; SMRD load using the load.const intrinsic with an immediate offset ; GCN-LABEL: {{^}}smrd_load_const0: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: 
[0x04 +; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { main_body: @@ -70,7 +100,7 @@ main_body: ; SMRD load using the load.const intrinsic with the largest possible immediate ; offset. ; GCN-LABEL: {{^}}smrd_load_const1: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { main_body: @@ -86,6 +116,7 @@ main_body: ; GCN-LABEL: {{^}}smrd_load_const2: ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { main_body: @@ -96,6 +127,36 @@ main_body: ret void } +; SMRD load with the largest possible immediate offset on VI +; GCN-LABEL: {{^}}smrd_load_const3: +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; 
CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +define void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; SMRD load with an offset greater than the largest possible immediate on VI +; GCN-LABEL: {{^}}smrd_load_const4: +; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 +; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 +; GCN: s_endpgm +define void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + ; Function Attrs: nounwind readnone declare float @llvm.SI.load.const(<16 x i8>, i32) #1 diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll index 46409cdfae1c7..9e181bc14d9db 100644 --- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll +++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -1,4 
+1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone @@ -8,9 +8,22 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; scc instead. ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { +; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}} +; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc +define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { + %v.val = load volatile i32, i32 addrspace(1)* %in + %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, 399 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0: +; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f +; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0 +define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -22,7 +35,20 @@ define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { +define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { + %v.val = load volatile i32, i32 addrspace(1)* %in + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 + %bc = bitcast <2 x i32> %vec.1 
to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1: +; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}} +define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -32,9 +58,9 @@ define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 } ; Doesn't use constants -; FUNC-LABEL @imp_def_vcc_split_i64_add_2 -; SI: v_add_i32 -; SI: v_addc_u32 +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2: +; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid diff --git a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll new file mode 100644 index 0000000000000..4c82ed6affc23 --- /dev/null +++ b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll @@ -0,0 +1,104 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s + +@sPrivateStorage = external addrspace(3) global [256 x [8 x <4 x i64>]] + +; GCN-LABEL: {{^}}ds_reorder_vector_split: + +; Write zeroinitializer +; GCN-DAG: ds_write_b64 [[PTR:v[0-9]+]], [[VAL:v\[[0-9]+:[0-9]+\]]] offset:24 +; GCN-DAG: ds_write_b64 [[PTR]], [[VAL]] offset:16 +; GCN-DAG: ds_write_b64 [[PTR]], [[VAL]] offset:8 +; GCN-DAG: ds_write_b64 [[PTR]], [[VAL]]{{$}} + +; GCN: s_waitcnt vmcnt + +; GCN-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 +; GCN-DAG: 
ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 +; GCN-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 + +; GCN: s_waitcnt lgkmcnt + +; GCN-DAG ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:8 +; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 +; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24 + +; Appears to be dead store of vector component. +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} + +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: s_endpgm +define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { +entry: + %tmp = tail call i32 @llvm.r600.read.local.size.y() + %tmp1 = tail call i32 @llvm.r600.read.local.size.z() + %tmp2 = tail call i32 @llvm.r600.read.tidig.x() + %tmp3 = tail call i32 @llvm.r600.read.tidig.y() + %tmp4 = tail call i32 @llvm.r600.read.tidig.z() + %tmp6 = mul i32 %tmp2, %tmp + %tmp10 = add i32 %tmp3, %tmp6 + %tmp11 = mul i32 %tmp10, %tmp1 + %tmp9 = add i32 %tmp11, %tmp4 + %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %x.i.12.i = tail call i32 @llvm.r600.read.local.size.x() #1 + %mul.26.i = mul i32 %x.i.12.i, %x.i.i + %add.i = add i32 %tmp2, %mul.26.i + %arrayidx = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %add.i + store <4 x i64> zeroinitializer, <4 x i64> addrspace(3)* %arrayidx + %tmp12 = sext i32 %add.i to i64 + %arrayidx1 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %srcValues, i64 %tmp12 + %tmp13 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %tmp12 + %tmp14 = load i32, i32 addrspace(1)* %arrayidx2 + %add.ptr = getelementptr [256 x [8 x <4 x i64>]], [256 x 
[8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 0, i32 %alignmentOffset + %mul.i = shl i32 %tmp14, 2 + %arrayidx.i = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr, i32 %mul.i + %tmp15 = bitcast i64 addrspace(3)* %arrayidx.i to <4 x i64> addrspace(3)* + store <4 x i64> %tmp13, <4 x i64> addrspace(3)* %tmp15 + %add.ptr6 = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %tmp14, i32 %alignmentOffset + %tmp16 = sext i32 %tmp14 to i64 + %tmp17 = sext i32 %alignmentOffset to i64 + %add.ptr9 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %destBuffer, i64 %tmp16, i64 %tmp17 + %tmp18 = bitcast <4 x i64> %tmp13 to i256 + %trunc = trunc i256 %tmp18 to i64 + store i64 %trunc, i64 addrspace(1)* %add.ptr9 + %arrayidx10.1 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 1 + %tmp19 = load i64, i64 addrspace(3)* %arrayidx10.1 + %arrayidx11.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 1 + store i64 %tmp19, i64 addrspace(1)* %arrayidx11.1 + %arrayidx10.2 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 2 + %tmp20 = load i64, i64 addrspace(3)* %arrayidx10.2 + %arrayidx11.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 2 + store i64 %tmp20, i64 addrspace(1)* %arrayidx11.2 + %arrayidx10.3 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 3 + %tmp21 = load i64, i64 addrspace(3)* %arrayidx10.3 + %arrayidx11.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 3 + store i64 %tmp21, i64 addrspace(1)* %arrayidx11.3 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.y() #1 + +; Function Attrs: nounwind 
readnone +declare i32 @llvm.r600.read.local.size.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.z() #1 + +attributes #0 = { norecurse nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index bcbc32f4c0539..3b59bbfb18c03 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -70,11 +70,11 @@ entry: ;EG-LABEL: {{^}}ashr_i64_2: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}} ;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index 0dad91e709d99..bbd9543563222 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -65,14 +65,14 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; EG: SUB_INT {{\*? 
*}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]|PV\.[XYZW]}} ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}} +; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]] ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 @@ -190,8 +190,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ; Make sure load width gets reduced to i32 load. 
; GCN-LABEL: {{^}}s_lshr_32_i64: ; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}} -; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll index 4a72b4d090adf..ba4049f28a6e5 100644 --- a/test/CodeGen/AMDGPU/store-barrier.ll +++ b/test/CodeGen/AMDGPU/store-barrier.ll @@ -36,7 +36,7 @@ bb: ret void } -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll index 0f89405e073b0..d22f43fa05ef3 100644 --- a/test/CodeGen/AMDGPU/store.ll +++ b/test/CodeGen/AMDGPU/store.ll @@ -287,16 +287,33 @@ entry: ; CM: LDS_WRITE ; CM: LDS_WRITE -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 +; SI: ds_write_b64 +; SI: ds_write_b64 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out ret void } +; FUNC-LABEL: {{^}}store_local_v4i32_align4: +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write2_b32 +; SI: ds_write2_b32 +define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}store_local_i64_i8: ; EG: LDS_BYTE_WRITE ; SI: ds_write_b8 diff --git a/test/CodeGen/AMDGPU/store_typed.ll b/test/CodeGen/AMDGPU/store_typed.ll new file mode 100644 index 
0000000000000..515fcf04f4067 --- /dev/null +++ b/test/CodeGen/AMDGPU/store_typed.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck --check-prefix=CM --check-prefix=FUNC %s + +; store to rat 0 +; FUNC-LABEL: {{^}}store_typed_rat0: +; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1 +; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}} + +define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { + call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0) + ret void +} + +; store to rat 11 +; FUNC-LABEL: {{^}}store_typed_rat11: +; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1 +; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}} + +define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { + call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11) + ret void +} + +declare void @llvm.r600.rat.store.typed(<4 x i32>, <4 x i32>, i32) diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index b7fba0efa5b29..9f9446a4e6087 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -7,7 +7,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; FUNC-LABEL: {{^}}test_sub_i32: ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in @@ -22,8 +22,8 @@ define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 
v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -40,10 +40,10 @@ define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1) ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll index bf690ca4cb282..ad52d0f2e2380 100644 --- a/test/CodeGen/AMDGPU/trunc.ll +++ b/test/CodeGen/AMDGPU/trunc.ll @@ -61,7 +61,7 @@ define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { } ; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: -; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} +; SI: s_and_b32 s{{[0-9]+}}, 1, s{{[0-9]+}} ; SI: v_cmp_eq_i32 define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { %trunc = trunc i32 %a to i1 @@ -72,9 +72,9 @@ define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { ; SI-LABEL: {{^}}s_trunc_i64_to_i1: ; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: 
v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +; SI: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] +; SI: v_cmp_eq_i32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll index b3837f28209af..f692b7dfdc271 100644 --- a/test/CodeGen/AMDGPU/udivrem.ll +++ b/test/CodeGen/AMDGPU/udivrem.ll @@ -30,19 +30,19 @@ ; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] ; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] ; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] -; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] +; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]] ; SI: v_cndmask_b32_e64 ; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] -; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] -; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] +; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], vcc, [[E]], [[RCP]] +; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], vcc, [[E]], [[RCP]] ; SI: v_cndmask_b32_e64 ; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] ; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] +; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] +; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]] ; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 @@ -110,15 +110,15 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { ; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] ; SI-DAG: v_mul_hi_u32 
[[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] ; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[FIRST_RCP_LO]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], vcc, [[FIRST_Num_S_Remainder]], v{{[0-9]+}} ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] @@ -133,15 +133,15 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { ; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] ; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] ; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[SECOND_RCP_LO]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], vcc, [[SECOND_E]], 
[[SECOND_RCP]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], vcc, [[SECOND_Num_S_Remainder]], v{{[0-9]+}} ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] @@ -257,83 +257,83 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3 ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; 
SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] -; 
SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] -; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] -; SI-DAG: v_add_i32_e32 
[[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 6f608df5e9f55..65fe580792a59 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -4,9 +4,9 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 ; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll index 8ab4faf2f1458..d120111a71fb3 100644 --- a/test/CodeGen/AMDGPU/unsupported-cc.ll +++ b/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -3,8 +3,8 @@ ; These tests are for condition codes that are not supported by the hardware ; CHECK-LABEL: {{^}}slt: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @slt(i32 addrspace(1)* %out, i32 %in) { entry: @@ -15,8 +15,8 @@ entry: } ; 
CHECK-LABEL: {{^}}ult_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -40,8 +40,8 @@ entry: } ; CHECK-LABEL: {{^}}ult_float_native: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ult_float_native(float addrspace(1)* %out, float %in) { entry: @@ -52,8 +52,8 @@ entry: } ; CHECK-LABEL: {{^}}olt: -; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 1084227584(5.000000e+00) define void @olt(float addrspace(1)* %out, float %in) { entry: @@ -64,8 +64,8 @@ entry: } ; CHECK-LABEL: {{^}}sle: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @sle(i32 addrspace(1)* %out, i32 %in) { entry: @@ -76,8 +76,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -101,8 +101,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_float_native: -; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? 
*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ule_float_native(float addrspace(1)* %out, float %in) { entry: @@ -113,8 +113,8 @@ entry: } ; CHECK-LABEL: {{^}}ole: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT:1084227584(5.000000e+00) define void @ole(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index f26f30022b4f3..87b925a24a041 100644 --- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s declare float @llvm.fma.f32(float, float, float) #1 +declare double @llvm.fma.f64(double, double, double) #1 declare float @llvm.fmuladd.f32(float, float, float) #1 declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 @@ -40,6 +41,32 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa ret void } +; GCN-LABEL: {{^}}test_use_s_v_s: +; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} + +; GCN: buffer_load_dword [[VA0:v[0-9]+]] +; GCN-NOT: v_mov_b32 +; GCN: buffer_load_dword [[VA1:v[0-9]+]] + +; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-NOT: v_mov_b32 + +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { + %va0 = load volatile float, float addrspace(1)* %in + %va1 = load volatile float, 
float addrspace(1)* %in + %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1 + %fma1 = call float @llvm.fma.f32(float %a, float %va1, float %b) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc @@ -99,5 +126,145 @@ define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 ret void } +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_kimm: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT0]] +define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 + store float %fma, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR0]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], [[SGPR1]] +; GCN: 
buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +; GCN: s_endpgm +define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 + %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 + store float %fma, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +; GCN: s_endpgm +define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 + %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 
[[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 + store float %fma, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +; GCN: s_endpgm +define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 + %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_s0_s1_k_f32: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 +; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]] + +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]] +; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]] + +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1 + 
%fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; FIXME: Immediate in SGPRs just copied to VGPRs +; GCN-LABEL: {{^}}test_s0_s1_k_f64: +; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} +; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000 +; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}} + +; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] +; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] +; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}} + +; Same zero component is re-used for half of each immediate. +; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 +; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}} + +; GCN: buffer_store_dwordx2 [[RESULT0]] +; GCN: buffer_store_dwordx2 [[RESULT1]] +define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { + %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1 + %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1 + store volatile double %fma0, double addrspace(1)* %out + store volatile double %fma1, double addrspace(1)* %out + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll index 7d0ebd139f518..1cbefba60c95f 100644 --- a/test/CodeGen/AMDGPU/valu-i1.ll +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -78,8 +78,8 @@ exit: ; SI: BB2_3: ; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: v_cmp_eq_i32_e32 vcc, +; SI-DAG: buffer_store_dword +; 
SI-DAG: v_cmp_eq_i32_e32 vcc, ; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] ; SI: s_andn2_b64 exec, exec, [[OR_SREG]] ; SI: s_cbranch_execnz BB2_3 @@ -128,18 +128,18 @@ exit: ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] ; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] -; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] -; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] +; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] +; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]] ; SI: s_cbranch_execz BB3_5 ; SI: BB#4: ; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 vcc -; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] +; SI: v_cmp_ge_i64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]] +; SI: s_or_b64 [[COND_STATE]], [[CMP]], [[COND_STATE]] ; SI: BB3_5: -; SI: s_or_b64 exec, exec, [[ORNEG1]] -; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] +; SI: s_or_b64 exec, exec, [[ORNEG2]] +; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[COND_STATE]] ; SI: s_andn2_b64 exec, exec, [[COND_STATE]] ; SI: s_cbranch_execnz BB3_3 diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll new file mode 100644 index 0000000000000..cd7c78f408ddd --- /dev/null +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -0,0 +1,585 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s +; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | 
FileCheck -check-prefix=GCN -check-prefix=VIHSA %s + +; This ends up using all 256 registers and requires register +; scavenging which will fail to find an unsued register. + +; Check the ScratchSize to avoid regressions from spilling +; intermediate register class copies. + +; FIXME: The same register is initialized to 0 for every spill. + +declare i32 @llvm.r600.read.tgid.x() #1 +declare i32 @llvm.r600.read.tgid.y() #1 +declare i32 @llvm.r600.read.tgid.z() #1 + +; GCN-LABEL: {{^}}spill_vgpr_compute: + +; GCN: s_mov_b32 s16, s3 +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + + +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 + +; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset. 
+define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +bb: + %tmp = add i32 %arg1, %arg2 + %tmp7 = extractelement <4 x float> %arg6, i32 0 + %tmp8 = extractelement <4 x float> %arg6, i32 1 + %tmp9 = extractelement <4 x float> %arg6, i32 2 + %tmp10 = extractelement <4 x float> %arg6, i32 3 + %tmp11 = bitcast float %arg5 to i32 + br label %bb12 + +bb12: ; preds = %bb145, %bb + %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ] + %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ] + %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ] + %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ] + %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ] + %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ] + %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ] + %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ] + %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ] + %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ] + %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ] + %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ] + %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 
] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ] + %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ] + %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ] + %tmp61 = phi float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ] + %tmp68 = 
phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ] + %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp282, %bb145 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ] + %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ] + %tmp99 = phi float [ 
0.000000e+00, %bb ], [ %tmp364, %bb145 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ] + %tmp116 = phi float [ 0.000000e+00, %bb ], [ %tmp381, %bb145 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ] + %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ] + %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ] + 
%tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] + %tmp142 = bitcast float %tmp95 to i32 + %tmp143 = icmp sgt i32 %tmp142, 125 + br i1 %tmp143, label %bb144, label %bb145 + +bb144: ; preds = %bb12 + store volatile float %arg3, float addrspace(1)* %arg + store volatile float %tmp91, float addrspace(1)* %arg + store volatile float %tmp90, float addrspace(1)* %arg + store volatile float %tmp89, float addrspace(1)* %arg + store volatile float %tmp87, float addrspace(1)* %arg + store volatile float %tmp86, float addrspace(1)* %arg + store volatile float %tmp85, float addrspace(1)* %arg + store volatile float %tmp83, float addrspace(1)* %arg + store volatile float %tmp82, float addrspace(1)* %arg + store volatile float %tmp81, float addrspace(1)* %arg + store volatile float %tmp79, float addrspace(1)* %arg + store volatile float %tmp78, float addrspace(1)* %arg + store volatile float %tmp77, float addrspace(1)* %arg + store volatile float %tmp75, float addrspace(1)* %arg + store volatile float %tmp74, float addrspace(1)* %arg + store volatile float %tmp73, float addrspace(1)* %arg + store volatile float %tmp71, float addrspace(1)* %arg + store volatile float %tmp70, float addrspace(1)* %arg + store volatile float %tmp69, float addrspace(1)* %arg + store 
volatile float %tmp67, float addrspace(1)* %arg + store volatile float %tmp66, float addrspace(1)* %arg + store volatile float %tmp65, float addrspace(1)* %arg + store volatile float %tmp63, float addrspace(1)* %arg + store volatile float %tmp62, float addrspace(1)* %arg + store volatile float %tmp61, float addrspace(1)* %arg + store volatile float %tmp59, float addrspace(1)* %arg + store volatile float %tmp58, float addrspace(1)* %arg + store volatile float %tmp57, float addrspace(1)* %arg + store volatile float %tmp55, float addrspace(1)* %arg + store volatile float %tmp54, float addrspace(1)* %arg + store volatile float %tmp53, float addrspace(1)* %arg + store volatile float %tmp51, float addrspace(1)* %arg + store volatile float %tmp50, float addrspace(1)* %arg + store volatile float %tmp49, float addrspace(1)* %arg + store volatile float %tmp47, float addrspace(1)* %arg + store volatile float %tmp46, float addrspace(1)* %arg + store volatile float %tmp45, float addrspace(1)* %arg + store volatile float %tmp43, float addrspace(1)* %arg + store volatile float %tmp42, float addrspace(1)* %arg + store volatile float %tmp41, float addrspace(1)* %arg + store volatile float %tmp39, float addrspace(1)* %arg + store volatile float %tmp38, float addrspace(1)* %arg + store volatile float %tmp37, float addrspace(1)* %arg + store volatile float %tmp35, float addrspace(1)* %arg + store volatile float %tmp34, float addrspace(1)* %arg + store volatile float %tmp33, float addrspace(1)* %arg + store volatile float %tmp31, float addrspace(1)* %arg + store volatile float %tmp30, float addrspace(1)* %arg + store volatile float %tmp29, float addrspace(1)* %arg + store volatile float %tmp27, float addrspace(1)* %arg + store volatile float %tmp26, float addrspace(1)* %arg + store volatile float %tmp25, float addrspace(1)* %arg + store volatile float %tmp23, float addrspace(1)* %arg + store volatile float %tmp22, float addrspace(1)* %arg + store volatile float %tmp21, float 
addrspace(1)* %arg + store volatile float %tmp19, float addrspace(1)* %arg + store volatile float %tmp18, float addrspace(1)* %arg + store volatile float %tmp17, float addrspace(1)* %arg + store volatile float %tmp15, float addrspace(1)* %arg + store volatile float %tmp14, float addrspace(1)* %arg + store volatile float %tmp13, float addrspace(1)* %arg + store volatile float %tmp16, float addrspace(1)* %arg + store volatile float %tmp20, float addrspace(1)* %arg + store volatile float %tmp24, float addrspace(1)* %arg + store volatile float %tmp28, float addrspace(1)* %arg + store volatile float %tmp32, float addrspace(1)* %arg + store volatile float %tmp36, float addrspace(1)* %arg + store volatile float %tmp40, float addrspace(1)* %arg + store volatile float %tmp44, float addrspace(1)* %arg + store volatile float %tmp48, float addrspace(1)* %arg + store volatile float %tmp52, float addrspace(1)* %arg + store volatile float %tmp56, float addrspace(1)* %arg + store volatile float %tmp60, float addrspace(1)* %arg + store volatile float %tmp64, float addrspace(1)* %arg + store volatile float %tmp68, float addrspace(1)* %arg + store volatile float %tmp72, float addrspace(1)* %arg + store volatile float %tmp76, float addrspace(1)* %arg + store volatile float %tmp80, float addrspace(1)* %arg + store volatile float %tmp84, float addrspace(1)* %arg + store volatile float %tmp88, float addrspace(1)* %arg + store volatile float %tmp92, float addrspace(1)* %arg + store volatile float %tmp93, float addrspace(1)* %arg + store volatile float %tmp94, float addrspace(1)* %arg + store volatile float %tmp96, float addrspace(1)* %arg + store volatile float %tmp97, float addrspace(1)* %arg + store volatile float %tmp98, float addrspace(1)* %arg + store volatile float %tmp99, float addrspace(1)* %arg + store volatile float %tmp100, float addrspace(1)* %arg + store volatile float %tmp101, float addrspace(1)* %arg + store volatile float %tmp102, float addrspace(1)* %arg + store volatile 
float %tmp103, float addrspace(1)* %arg + store volatile float %tmp104, float addrspace(1)* %arg + store volatile float %tmp105, float addrspace(1)* %arg + store volatile float %tmp106, float addrspace(1)* %arg + store volatile float %tmp107, float addrspace(1)* %arg + store volatile float %tmp108, float addrspace(1)* %arg + store volatile float %tmp109, float addrspace(1)* %arg + store volatile float %tmp110, float addrspace(1)* %arg + store volatile float %tmp111, float addrspace(1)* %arg + store volatile float %tmp112, float addrspace(1)* %arg + store volatile float %tmp113, float addrspace(1)* %arg + store volatile float %tmp114, float addrspace(1)* %arg + store volatile float %tmp115, float addrspace(1)* %arg + store volatile float %tmp116, float addrspace(1)* %arg + store volatile float %tmp117, float addrspace(1)* %arg + store volatile float %tmp118, float addrspace(1)* %arg + store volatile float %tmp119, float addrspace(1)* %arg + store volatile float %tmp120, float addrspace(1)* %arg + store volatile float %tmp121, float addrspace(1)* %arg + store volatile float %tmp122, float addrspace(1)* %arg + store volatile float %tmp123, float addrspace(1)* %arg + store volatile float %tmp124, float addrspace(1)* %arg + store volatile float %tmp125, float addrspace(1)* %arg + store volatile float %tmp126, float addrspace(1)* %arg + store volatile float %tmp127, float addrspace(1)* %arg + store volatile float %tmp128, float addrspace(1)* %arg + store volatile float %tmp129, float addrspace(1)* %arg + store volatile float %tmp130, float addrspace(1)* %arg + store volatile float %tmp131, float addrspace(1)* %arg + store volatile float %tmp132, float addrspace(1)* %arg + store volatile float %tmp133, float addrspace(1)* %arg + store volatile float %tmp134, float addrspace(1)* %arg + store volatile float %tmp135, float addrspace(1)* %arg + store volatile float %tmp136, float addrspace(1)* %arg + store volatile float %tmp137, float addrspace(1)* %arg + store volatile 
float %tmp138, float addrspace(1)* %arg + store volatile float %tmp139, float addrspace(1)* %arg + store volatile float %arg4, float addrspace(1)* %arg + store volatile float %tmp7, float addrspace(1)* %arg + store volatile float %tmp8, float addrspace(1)* %arg + store volatile float %tmp9, float addrspace(1)* %arg + store volatile float %tmp10, float addrspace(1)* %arg + ret void + +bb145: ; preds = %bb12 + %tmp146 = bitcast float %tmp95 to i32 + %tmp147 = bitcast float %tmp95 to i32 + %tmp148 = add i32 %tmp11, %tmp147 + %tmp149 = bitcast i32 %tmp148 to float + %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0 + %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1 + %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2 + %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3 + %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4 + %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5 + %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6 + %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7 + %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8 + %tmp159 = insertelement <128 x float> %tmp158, float %tmp79, i32 9 + %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10 + %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11 + %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16 + %tmp167 = insertelement <128 x float> %tmp166, float %tmp69, i32 17 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19 + %tmp170 = insertelement <128 x float> 
%tmp169, float %tmp65, i32 20 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32 + %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37 + %tmp188 = insertelement <128 x float> %tmp187, float %tmp41, i32 38 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48 + %tmp199 
= insertelement <128 x float> %tmp198, float %tmp26, i32 49 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61 + %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65 + %tmp216 = insertelement <128 x float> %tmp215, float %tmp40, i32 66 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67 + %tmp218 = insertelement <128 x float> %tmp217, float %tmp48, i32 68 + %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69 + %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70 + %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71 + %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72 + %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73 + %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74 + %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75 + %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76 + %tmp227 = insertelement <128 x float> %tmp226, 
float %tmp84, i32 77 + %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78 + %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79 + %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80 + %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81 + %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82 + %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83 + %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84 + %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85 + %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86 + %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87 + %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88 + %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89 + %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90 + %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91 + %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92 + %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93 + %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94 + %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 95 + %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96 + %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97 + %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98 + %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99 + %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100 + %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101 + %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102 + %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103 + %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104 + %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, 
i32 105 + %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106 + %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107 + %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108 + %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109 + %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110 + %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111 + %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112 + %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113 + %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114 + %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115 + %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116 + %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117 + %tmp268 = insertelement <128 x float> %tmp267, float %tmp132, i32 118 + %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119 + %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120 + %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121 + %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 122 + %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123 + %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124 + %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125 + %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126 + %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127 + %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146 + %tmp279 = extractelement <128 x float> %tmp278, i32 0 + %tmp280 = extractelement <128 x float> %tmp278, i32 1 + %tmp281 = extractelement <128 x float> %tmp278, i32 2 + %tmp282 = extractelement <128 x float> %tmp278, i32 3 + %tmp283 = extractelement <128 x float> %tmp278, i32 4 + %tmp284 = extractelement <128 x float> %tmp278, 
i32 5 + %tmp285 = extractelement <128 x float> %tmp278, i32 6 + %tmp286 = extractelement <128 x float> %tmp278, i32 7 + %tmp287 = extractelement <128 x float> %tmp278, i32 8 + %tmp288 = extractelement <128 x float> %tmp278, i32 9 + %tmp289 = extractelement <128 x float> %tmp278, i32 10 + %tmp290 = extractelement <128 x float> %tmp278, i32 11 + %tmp291 = extractelement <128 x float> %tmp278, i32 12 + %tmp292 = extractelement <128 x float> %tmp278, i32 13 + %tmp293 = extractelement <128 x float> %tmp278, i32 14 + %tmp294 = extractelement <128 x float> %tmp278, i32 15 + %tmp295 = extractelement <128 x float> %tmp278, i32 16 + %tmp296 = extractelement <128 x float> %tmp278, i32 17 + %tmp297 = extractelement <128 x float> %tmp278, i32 18 + %tmp298 = extractelement <128 x float> %tmp278, i32 19 + %tmp299 = extractelement <128 x float> %tmp278, i32 20 + %tmp300 = extractelement <128 x float> %tmp278, i32 21 + %tmp301 = extractelement <128 x float> %tmp278, i32 22 + %tmp302 = extractelement <128 x float> %tmp278, i32 23 + %tmp303 = extractelement <128 x float> %tmp278, i32 24 + %tmp304 = extractelement <128 x float> %tmp278, i32 25 + %tmp305 = extractelement <128 x float> %tmp278, i32 26 + %tmp306 = extractelement <128 x float> %tmp278, i32 27 + %tmp307 = extractelement <128 x float> %tmp278, i32 28 + %tmp308 = extractelement <128 x float> %tmp278, i32 29 + %tmp309 = extractelement <128 x float> %tmp278, i32 30 + %tmp310 = extractelement <128 x float> %tmp278, i32 31 + %tmp311 = extractelement <128 x float> %tmp278, i32 32 + %tmp312 = extractelement <128 x float> %tmp278, i32 33 + %tmp313 = extractelement <128 x float> %tmp278, i32 34 + %tmp314 = extractelement <128 x float> %tmp278, i32 35 + %tmp315 = extractelement <128 x float> %tmp278, i32 36 + %tmp316 = extractelement <128 x float> %tmp278, i32 37 + %tmp317 = extractelement <128 x float> %tmp278, i32 38 + %tmp318 = extractelement <128 x float> %tmp278, i32 39 + %tmp319 = extractelement <128 x float> %tmp278, i32 40 + 
%tmp320 = extractelement <128 x float> %tmp278, i32 41 + %tmp321 = extractelement <128 x float> %tmp278, i32 42 + %tmp322 = extractelement <128 x float> %tmp278, i32 43 + %tmp323 = extractelement <128 x float> %tmp278, i32 44 + %tmp324 = extractelement <128 x float> %tmp278, i32 45 + %tmp325 = extractelement <128 x float> %tmp278, i32 46 + %tmp326 = extractelement <128 x float> %tmp278, i32 47 + %tmp327 = extractelement <128 x float> %tmp278, i32 48 + %tmp328 = extractelement <128 x float> %tmp278, i32 49 + %tmp329 = extractelement <128 x float> %tmp278, i32 50 + %tmp330 = extractelement <128 x float> %tmp278, i32 51 + %tmp331 = extractelement <128 x float> %tmp278, i32 52 + %tmp332 = extractelement <128 x float> %tmp278, i32 53 + %tmp333 = extractelement <128 x float> %tmp278, i32 54 + %tmp334 = extractelement <128 x float> %tmp278, i32 55 + %tmp335 = extractelement <128 x float> %tmp278, i32 56 + %tmp336 = extractelement <128 x float> %tmp278, i32 57 + %tmp337 = extractelement <128 x float> %tmp278, i32 58 + %tmp338 = extractelement <128 x float> %tmp278, i32 59 + %tmp339 = extractelement <128 x float> %tmp278, i32 60 + %tmp340 = extractelement <128 x float> %tmp278, i32 61 + %tmp341 = extractelement <128 x float> %tmp278, i32 62 + %tmp342 = extractelement <128 x float> %tmp278, i32 63 + %tmp343 = extractelement <128 x float> %tmp278, i32 64 + %tmp344 = extractelement <128 x float> %tmp278, i32 65 + %tmp345 = extractelement <128 x float> %tmp278, i32 66 + %tmp346 = extractelement <128 x float> %tmp278, i32 67 + %tmp347 = extractelement <128 x float> %tmp278, i32 68 + %tmp348 = extractelement <128 x float> %tmp278, i32 69 + %tmp349 = extractelement <128 x float> %tmp278, i32 70 + %tmp350 = extractelement <128 x float> %tmp278, i32 71 + %tmp351 = extractelement <128 x float> %tmp278, i32 72 + %tmp352 = extractelement <128 x float> %tmp278, i32 73 + %tmp353 = extractelement <128 x float> %tmp278, i32 74 + %tmp354 = extractelement <128 x float> %tmp278, i32 75 + 
%tmp355 = extractelement <128 x float> %tmp278, i32 76 + %tmp356 = extractelement <128 x float> %tmp278, i32 77 + %tmp357 = extractelement <128 x float> %tmp278, i32 78 + %tmp358 = extractelement <128 x float> %tmp278, i32 79 + %tmp359 = extractelement <128 x float> %tmp278, i32 80 + %tmp360 = extractelement <128 x float> %tmp278, i32 81 + %tmp361 = extractelement <128 x float> %tmp278, i32 82 + %tmp362 = extractelement <128 x float> %tmp278, i32 83 + %tmp363 = extractelement <128 x float> %tmp278, i32 84 + %tmp364 = extractelement <128 x float> %tmp278, i32 85 + %tmp365 = extractelement <128 x float> %tmp278, i32 86 + %tmp366 = extractelement <128 x float> %tmp278, i32 87 + %tmp367 = extractelement <128 x float> %tmp278, i32 88 + %tmp368 = extractelement <128 x float> %tmp278, i32 89 + %tmp369 = extractelement <128 x float> %tmp278, i32 90 + %tmp370 = extractelement <128 x float> %tmp278, i32 91 + %tmp371 = extractelement <128 x float> %tmp278, i32 92 + %tmp372 = extractelement <128 x float> %tmp278, i32 93 + %tmp373 = extractelement <128 x float> %tmp278, i32 94 + %tmp374 = extractelement <128 x float> %tmp278, i32 95 + %tmp375 = extractelement <128 x float> %tmp278, i32 96 + %tmp376 = extractelement <128 x float> %tmp278, i32 97 + %tmp377 = extractelement <128 x float> %tmp278, i32 98 + %tmp378 = extractelement <128 x float> %tmp278, i32 99 + %tmp379 = extractelement <128 x float> %tmp278, i32 100 + %tmp380 = extractelement <128 x float> %tmp278, i32 101 + %tmp381 = extractelement <128 x float> %tmp278, i32 102 + %tmp382 = extractelement <128 x float> %tmp278, i32 103 + %tmp383 = extractelement <128 x float> %tmp278, i32 104 + %tmp384 = extractelement <128 x float> %tmp278, i32 105 + %tmp385 = extractelement <128 x float> %tmp278, i32 106 + %tmp386 = extractelement <128 x float> %tmp278, i32 107 + %tmp387 = extractelement <128 x float> %tmp278, i32 108 + %tmp388 = extractelement <128 x float> %tmp278, i32 109 + %tmp389 = extractelement <128 x float> %tmp278, i32 
110 + %tmp390 = extractelement <128 x float> %tmp278, i32 111 + %tmp391 = extractelement <128 x float> %tmp278, i32 112 + %tmp392 = extractelement <128 x float> %tmp278, i32 113 + %tmp393 = extractelement <128 x float> %tmp278, i32 114 + %tmp394 = extractelement <128 x float> %tmp278, i32 115 + %tmp395 = extractelement <128 x float> %tmp278, i32 116 + %tmp396 = extractelement <128 x float> %tmp278, i32 117 + %tmp397 = extractelement <128 x float> %tmp278, i32 118 + %tmp398 = extractelement <128 x float> %tmp278, i32 119 + %tmp399 = extractelement <128 x float> %tmp278, i32 120 + %tmp400 = extractelement <128 x float> %tmp278, i32 121 + %tmp401 = extractelement <128 x float> %tmp278, i32 122 + %tmp402 = extractelement <128 x float> %tmp278, i32 123 + %tmp403 = extractelement <128 x float> %tmp278, i32 124 + %tmp404 = extractelement <128 x float> %tmp278, i32 125 + %tmp405 = extractelement <128 x float> %tmp278, i32 126 + %tmp406 = extractelement <128 x float> %tmp278, i32 127 + %tmp407 = bitcast float %tmp95 to i32 + %tmp408 = add i32 %tmp407, 1 + %tmp409 = bitcast i32 %tmp408 to float + br label %bb12 +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll new file mode 100644 index 0000000000000..16abb89bb0b80 --- /dev/null +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -0,0 +1,494 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; This ends up using all 255 registers and requires register +; scavenging which will fail to find an unsued register. + +; Check the ScratchSize to avoid regressions from spilling +; intermediate register class copies. 
+ +; FIXME: The same register is initialized to 0 for every spill. + +; GCN-LABEL: {{^}}main: + +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + +; s12 is offset user SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 + +define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { +bb: + %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0 + %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp12 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 0) + %tmp13 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 16) + %tmp14 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 32) + %tmp15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 + %tmp16 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp15, align 16, !tbaa !0 + %tmp17 = add i32 %arg5, %arg7 + %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp16, i32 0, i32 %tmp17) + %tmp19 = extractelement <4 x float> %tmp18, i32 0 + %tmp20 = extractelement <4 x float> %tmp18, i32 1 + %tmp21 = extractelement <4 x float> %tmp18, i32 2 + %tmp22 = extractelement <4 x float> %tmp18, i32 3 + %tmp23 = bitcast float %tmp14 to i32 + br label %bb24 + +bb24: ; preds = %bb157, %bb + %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb157 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb157 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb157 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp351, 
%bb157 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb157 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb157 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb157 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb157 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb157 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb157 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb157 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb157 ] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb157 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb157 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb157 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb157 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb157 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb157 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb157 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb157 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb157 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb157 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb157 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb157 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb157 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb157 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb157 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb157 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb157 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb157 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb157 ] + %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb157 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb157 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb157 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb157 ] + 
%tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb157 ] + %tmp61 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb157 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb157 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb157 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb157 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb157 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb157 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb157 ] + %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb157 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb157 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb157 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb157 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb157 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb157 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb157 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb157 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb157 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb157 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb157 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb157 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb157 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb157 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb157 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb157 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb157 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb157 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb157 ] + %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb157 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb157 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp302, %bb157 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb157 ] + %tmp91 = phi 
float [ 0.000000e+00, %bb ], [ %tmp300, %bb157 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb157 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb157 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb157 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb157 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb157 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb157 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb157 ] + %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb157 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb157 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb157 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb157 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb157 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb157 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb157 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb157 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp421, %bb157 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb157 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb157 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb157 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb157 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb157 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb157 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb157 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb157 ] + %tmp116 = phi float [ 0.000000e+00, %bb ], [ %tmp381, %bb157 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb157 ] + %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb157 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb157 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb157 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb157 ] + 
%tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb157 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb157 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb157 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb157 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb157 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb157 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb157 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb157 ] + %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb157 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb157 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb157 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb157 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb157 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb157 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb157 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb157 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb157 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb157 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb157 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb157 ] + %tmp142 = phi float [ 0.000000e+00, %bb ], [ %tmp407, %bb157 ] + %tmp143 = phi float [ 0.000000e+00, %bb ], [ %tmp408, %bb157 ] + %tmp144 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb157 ] + %tmp145 = phi float [ 0.000000e+00, %bb ], [ %tmp410, %bb157 ] + %tmp146 = phi float [ 0.000000e+00, %bb ], [ %tmp411, %bb157 ] + %tmp147 = phi float [ 0.000000e+00, %bb ], [ %tmp412, %bb157 ] + %tmp148 = phi float [ 0.000000e+00, %bb ], [ %tmp413, %bb157 ] + %tmp149 = phi float [ 0.000000e+00, %bb ], [ %tmp414, %bb157 ] + %tmp150 = phi float [ 0.000000e+00, %bb ], [ %tmp415, %bb157 ] + %tmp151 = phi float [ 0.000000e+00, %bb ], [ %tmp416, %bb157 ] + %tmp152 = phi float [ 0.000000e+00, %bb ], [ 
%tmp417, %bb157 ] + %tmp153 = phi float [ 0.000000e+00, %bb ], [ %tmp418, %bb157 ] + %tmp154 = bitcast float %tmp107 to i32 + %tmp155 = icmp sgt i32 %tmp154, 125 + br i1 %tmp155, label %bb156, label %bb157 + +bb156: ; preds = %bb24 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp12, float %tmp103, float %tmp102, float %tmp101) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 33, i32 0, float %tmp99, float %tmp98, float %tmp97, float %tmp95) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 34, i32 0, float %tmp94, float %tmp93, float %tmp91, float %tmp90) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 35, i32 0, float %tmp89, float %tmp87, float %tmp86, float %tmp85) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 36, i32 0, float %tmp83, float %tmp82, float %tmp81, float %tmp79) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 37, i32 0, float %tmp78, float %tmp77, float %tmp75, float %tmp74) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 38, i32 0, float %tmp73, float %tmp71, float %tmp70, float %tmp69) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 39, i32 0, float %tmp67, float %tmp66, float %tmp65, float %tmp63) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 40, i32 0, float %tmp62, float %tmp61, float %tmp59, float %tmp58) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 41, i32 0, float %tmp57, float %tmp55, float %tmp54, float %tmp53) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 42, i32 0, float %tmp51, float %tmp50, float %tmp49, float %tmp47) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 43, i32 0, float %tmp46, float %tmp45, float %tmp43, float %tmp42) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 44, i32 0, float %tmp41, float %tmp39, float %tmp38, float %tmp37) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 45, i32 0, float %tmp35, float %tmp34, float %tmp33, float %tmp31) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 46, i32 0, float 
%tmp30, float %tmp29, float %tmp27, float %tmp26) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 47, i32 0, float %tmp25, float %tmp28, float %tmp32, float %tmp36) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 48, i32 0, float %tmp40, float %tmp44, float %tmp48, float %tmp52) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 49, i32 0, float %tmp56, float %tmp60, float %tmp64, float %tmp68) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 50, i32 0, float %tmp72, float %tmp76, float %tmp80, float %tmp84) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 51, i32 0, float %tmp88, float %tmp92, float %tmp96, float %tmp100) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 52, i32 0, float %tmp104, float %tmp105, float %tmp106, float %tmp108) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 53, i32 0, float %tmp109, float %tmp110, float %tmp111, float %tmp112) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 54, i32 0, float %tmp113, float %tmp114, float %tmp115, float %tmp116) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 55, i32 0, float %tmp117, float %tmp118, float %tmp119, float %tmp120) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 56, i32 0, float %tmp121, float %tmp122, float %tmp123, float %tmp124) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 57, i32 0, float %tmp125, float %tmp126, float %tmp127, float %tmp128) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 58, i32 0, float %tmp129, float %tmp130, float %tmp131, float %tmp132) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 59, i32 0, float %tmp133, float %tmp134, float %tmp135, float %tmp136) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 60, i32 0, float %tmp137, float %tmp138, float %tmp139, float %tmp140) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 61, i32 0, float %tmp141, float %tmp142, float %tmp143, float %tmp144) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 62, i32 0, float %tmp145, float 
%tmp146, float %tmp147, float %tmp148) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float %tmp149, float %tmp150, float %tmp151, float %tmp13) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + ret void + +bb157: ; preds = %bb24 + %tmp158 = bitcast float %tmp107 to i32 + %tmp159 = bitcast float %tmp107 to i32 + %tmp160 = add i32 %tmp23, %tmp159 + %tmp161 = bitcast i32 %tmp160 to float + %tmp162 = insertelement <128 x float> undef, float %tmp103, i32 0 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp102, i32 1 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp101, i32 2 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp99, i32 3 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp98, i32 4 + %tmp167 = insertelement <128 x float> %tmp166, float %tmp97, i32 5 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp95, i32 6 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp94, i32 7 + %tmp170 = insertelement <128 x float> %tmp169, float %tmp93, i32 8 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp91, i32 9 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp90, i32 10 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp89, i32 11 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp87, i32 12 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp86, i32 13 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp85, i32 14 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp83, i32 15 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp82, i32 16 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp81, i32 17 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp79, i32 18 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp78, i32 19 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp77, i32 20 + %tmp183 = insertelement <128 x float> %tmp182, float 
%tmp75, i32 21 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp74, i32 22 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp73, i32 23 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp71, i32 24 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp70, i32 25 + %tmp188 = insertelement <128 x float> %tmp187, float %tmp69, i32 26 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp67, i32 27 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp66, i32 28 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp65, i32 29 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp63, i32 30 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp62, i32 31 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp61, i32 32 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp59, i32 33 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp58, i32 34 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp57, i32 35 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp55, i32 36 + %tmp199 = insertelement <128 x float> %tmp198, float %tmp54, i32 37 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp53, i32 38 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp51, i32 39 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp50, i32 40 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp49, i32 41 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp47, i32 42 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp46, i32 43 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp45, i32 44 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp43, i32 45 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp42, i32 46 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp41, i32 47 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp39, i32 48 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp38, i32 49 + %tmp212 = 
insertelement <128 x float> %tmp211, float %tmp37, i32 50 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp35, i32 51 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp34, i32 52 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp33, i32 53 + %tmp216 = insertelement <128 x float> %tmp215, float %tmp31, i32 54 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp30, i32 55 + %tmp218 = insertelement <128 x float> %tmp217, float %tmp29, i32 56 + %tmp219 = insertelement <128 x float> %tmp218, float %tmp27, i32 57 + %tmp220 = insertelement <128 x float> %tmp219, float %tmp26, i32 58 + %tmp221 = insertelement <128 x float> %tmp220, float %tmp25, i32 59 + %tmp222 = insertelement <128 x float> %tmp221, float %tmp28, i32 60 + %tmp223 = insertelement <128 x float> %tmp222, float %tmp32, i32 61 + %tmp224 = insertelement <128 x float> %tmp223, float %tmp36, i32 62 + %tmp225 = insertelement <128 x float> %tmp224, float %tmp40, i32 63 + %tmp226 = insertelement <128 x float> %tmp225, float %tmp44, i32 64 + %tmp227 = insertelement <128 x float> %tmp226, float %tmp48, i32 65 + %tmp228 = insertelement <128 x float> %tmp227, float %tmp52, i32 66 + %tmp229 = insertelement <128 x float> %tmp228, float %tmp56, i32 67 + %tmp230 = insertelement <128 x float> %tmp229, float %tmp60, i32 68 + %tmp231 = insertelement <128 x float> %tmp230, float %tmp64, i32 69 + %tmp232 = insertelement <128 x float> %tmp231, float %tmp68, i32 70 + %tmp233 = insertelement <128 x float> %tmp232, float %tmp72, i32 71 + %tmp234 = insertelement <128 x float> %tmp233, float %tmp76, i32 72 + %tmp235 = insertelement <128 x float> %tmp234, float %tmp80, i32 73 + %tmp236 = insertelement <128 x float> %tmp235, float %tmp84, i32 74 + %tmp237 = insertelement <128 x float> %tmp236, float %tmp88, i32 75 + %tmp238 = insertelement <128 x float> %tmp237, float %tmp92, i32 76 + %tmp239 = insertelement <128 x float> %tmp238, float %tmp96, i32 77 + %tmp240 = insertelement <128 x float> %tmp239, 
float %tmp100, i32 78 + %tmp241 = insertelement <128 x float> %tmp240, float %tmp104, i32 79 + %tmp242 = insertelement <128 x float> %tmp241, float %tmp105, i32 80 + %tmp243 = insertelement <128 x float> %tmp242, float %tmp106, i32 81 + %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 82 + %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 83 + %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 84 + %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 85 + %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 86 + %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 87 + %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 88 + %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 89 + %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 90 + %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 91 + %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 92 + %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 93 + %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 94 + %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 95 + %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 96 + %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 97 + %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 98 + %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 99 + %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 100 + %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 101 + %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 102 + %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 103 + %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 104 + %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 105 + %tmp268 = insertelement <128 x float> %tmp267, float 
%tmp132, i32 106 + %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 107 + %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 108 + %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 109 + %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 110 + %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 111 + %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 112 + %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 113 + %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 114 + %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 115 + %tmp278 = insertelement <128 x float> %tmp277, float %tmp142, i32 116 + %tmp279 = insertelement <128 x float> %tmp278, float %tmp143, i32 117 + %tmp280 = insertelement <128 x float> %tmp279, float %tmp144, i32 118 + %tmp281 = insertelement <128 x float> %tmp280, float %tmp145, i32 119 + %tmp282 = insertelement <128 x float> %tmp281, float %tmp146, i32 120 + %tmp283 = insertelement <128 x float> %tmp282, float %tmp147, i32 121 + %tmp284 = insertelement <128 x float> %tmp283, float %tmp148, i32 122 + %tmp285 = insertelement <128 x float> %tmp284, float %tmp149, i32 123 + %tmp286 = insertelement <128 x float> %tmp285, float %tmp150, i32 124 + %tmp287 = insertelement <128 x float> %tmp286, float %tmp151, i32 125 + %tmp288 = insertelement <128 x float> %tmp287, float %tmp152, i32 126 + %tmp289 = insertelement <128 x float> %tmp288, float %tmp153, i32 127 + %tmp290 = insertelement <128 x float> %tmp289, float %tmp161, i32 %tmp158 + %tmp291 = extractelement <128 x float> %tmp290, i32 0 + %tmp292 = extractelement <128 x float> %tmp290, i32 1 + %tmp293 = extractelement <128 x float> %tmp290, i32 2 + %tmp294 = extractelement <128 x float> %tmp290, i32 3 + %tmp295 = extractelement <128 x float> %tmp290, i32 4 + %tmp296 = extractelement <128 x float> %tmp290, i32 5 + %tmp297 = extractelement <128 x float> %tmp290, i32 6 + 
%tmp298 = extractelement <128 x float> %tmp290, i32 7 + %tmp299 = extractelement <128 x float> %tmp290, i32 8 + %tmp300 = extractelement <128 x float> %tmp290, i32 9 + %tmp301 = extractelement <128 x float> %tmp290, i32 10 + %tmp302 = extractelement <128 x float> %tmp290, i32 11 + %tmp303 = extractelement <128 x float> %tmp290, i32 12 + %tmp304 = extractelement <128 x float> %tmp290, i32 13 + %tmp305 = extractelement <128 x float> %tmp290, i32 14 + %tmp306 = extractelement <128 x float> %tmp290, i32 15 + %tmp307 = extractelement <128 x float> %tmp290, i32 16 + %tmp308 = extractelement <128 x float> %tmp290, i32 17 + %tmp309 = extractelement <128 x float> %tmp290, i32 18 + %tmp310 = extractelement <128 x float> %tmp290, i32 19 + %tmp311 = extractelement <128 x float> %tmp290, i32 20 + %tmp312 = extractelement <128 x float> %tmp290, i32 21 + %tmp313 = extractelement <128 x float> %tmp290, i32 22 + %tmp314 = extractelement <128 x float> %tmp290, i32 23 + %tmp315 = extractelement <128 x float> %tmp290, i32 24 + %tmp316 = extractelement <128 x float> %tmp290, i32 25 + %tmp317 = extractelement <128 x float> %tmp290, i32 26 + %tmp318 = extractelement <128 x float> %tmp290, i32 27 + %tmp319 = extractelement <128 x float> %tmp290, i32 28 + %tmp320 = extractelement <128 x float> %tmp290, i32 29 + %tmp321 = extractelement <128 x float> %tmp290, i32 30 + %tmp322 = extractelement <128 x float> %tmp290, i32 31 + %tmp323 = extractelement <128 x float> %tmp290, i32 32 + %tmp324 = extractelement <128 x float> %tmp290, i32 33 + %tmp325 = extractelement <128 x float> %tmp290, i32 34 + %tmp326 = extractelement <128 x float> %tmp290, i32 35 + %tmp327 = extractelement <128 x float> %tmp290, i32 36 + %tmp328 = extractelement <128 x float> %tmp290, i32 37 + %tmp329 = extractelement <128 x float> %tmp290, i32 38 + %tmp330 = extractelement <128 x float> %tmp290, i32 39 + %tmp331 = extractelement <128 x float> %tmp290, i32 40 + %tmp332 = extractelement <128 x float> %tmp290, i32 41 + %tmp333 
= extractelement <128 x float> %tmp290, i32 42 + %tmp334 = extractelement <128 x float> %tmp290, i32 43 + %tmp335 = extractelement <128 x float> %tmp290, i32 44 + %tmp336 = extractelement <128 x float> %tmp290, i32 45 + %tmp337 = extractelement <128 x float> %tmp290, i32 46 + %tmp338 = extractelement <128 x float> %tmp290, i32 47 + %tmp339 = extractelement <128 x float> %tmp290, i32 48 + %tmp340 = extractelement <128 x float> %tmp290, i32 49 + %tmp341 = extractelement <128 x float> %tmp290, i32 50 + %tmp342 = extractelement <128 x float> %tmp290, i32 51 + %tmp343 = extractelement <128 x float> %tmp290, i32 52 + %tmp344 = extractelement <128 x float> %tmp290, i32 53 + %tmp345 = extractelement <128 x float> %tmp290, i32 54 + %tmp346 = extractelement <128 x float> %tmp290, i32 55 + %tmp347 = extractelement <128 x float> %tmp290, i32 56 + %tmp348 = extractelement <128 x float> %tmp290, i32 57 + %tmp349 = extractelement <128 x float> %tmp290, i32 58 + %tmp350 = extractelement <128 x float> %tmp290, i32 59 + %tmp351 = extractelement <128 x float> %tmp290, i32 60 + %tmp352 = extractelement <128 x float> %tmp290, i32 61 + %tmp353 = extractelement <128 x float> %tmp290, i32 62 + %tmp354 = extractelement <128 x float> %tmp290, i32 63 + %tmp355 = extractelement <128 x float> %tmp290, i32 64 + %tmp356 = extractelement <128 x float> %tmp290, i32 65 + %tmp357 = extractelement <128 x float> %tmp290, i32 66 + %tmp358 = extractelement <128 x float> %tmp290, i32 67 + %tmp359 = extractelement <128 x float> %tmp290, i32 68 + %tmp360 = extractelement <128 x float> %tmp290, i32 69 + %tmp361 = extractelement <128 x float> %tmp290, i32 70 + %tmp362 = extractelement <128 x float> %tmp290, i32 71 + %tmp363 = extractelement <128 x float> %tmp290, i32 72 + %tmp364 = extractelement <128 x float> %tmp290, i32 73 + %tmp365 = extractelement <128 x float> %tmp290, i32 74 + %tmp366 = extractelement <128 x float> %tmp290, i32 75 + %tmp367 = extractelement <128 x float> %tmp290, i32 76 + %tmp368 = 
extractelement <128 x float> %tmp290, i32 77 + %tmp369 = extractelement <128 x float> %tmp290, i32 78 + %tmp370 = extractelement <128 x float> %tmp290, i32 79 + %tmp371 = extractelement <128 x float> %tmp290, i32 80 + %tmp372 = extractelement <128 x float> %tmp290, i32 81 + %tmp373 = extractelement <128 x float> %tmp290, i32 82 + %tmp374 = extractelement <128 x float> %tmp290, i32 83 + %tmp375 = extractelement <128 x float> %tmp290, i32 84 + %tmp376 = extractelement <128 x float> %tmp290, i32 85 + %tmp377 = extractelement <128 x float> %tmp290, i32 86 + %tmp378 = extractelement <128 x float> %tmp290, i32 87 + %tmp379 = extractelement <128 x float> %tmp290, i32 88 + %tmp380 = extractelement <128 x float> %tmp290, i32 89 + %tmp381 = extractelement <128 x float> %tmp290, i32 90 + %tmp382 = extractelement <128 x float> %tmp290, i32 91 + %tmp383 = extractelement <128 x float> %tmp290, i32 92 + %tmp384 = extractelement <128 x float> %tmp290, i32 93 + %tmp385 = extractelement <128 x float> %tmp290, i32 94 + %tmp386 = extractelement <128 x float> %tmp290, i32 95 + %tmp387 = extractelement <128 x float> %tmp290, i32 96 + %tmp388 = extractelement <128 x float> %tmp290, i32 97 + %tmp389 = extractelement <128 x float> %tmp290, i32 98 + %tmp390 = extractelement <128 x float> %tmp290, i32 99 + %tmp391 = extractelement <128 x float> %tmp290, i32 100 + %tmp392 = extractelement <128 x float> %tmp290, i32 101 + %tmp393 = extractelement <128 x float> %tmp290, i32 102 + %tmp394 = extractelement <128 x float> %tmp290, i32 103 + %tmp395 = extractelement <128 x float> %tmp290, i32 104 + %tmp396 = extractelement <128 x float> %tmp290, i32 105 + %tmp397 = extractelement <128 x float> %tmp290, i32 106 + %tmp398 = extractelement <128 x float> %tmp290, i32 107 + %tmp399 = extractelement <128 x float> %tmp290, i32 108 + %tmp400 = extractelement <128 x float> %tmp290, i32 109 + %tmp401 = extractelement <128 x float> %tmp290, i32 110 + %tmp402 = extractelement <128 x float> %tmp290, i32 111 + 
%tmp403 = extractelement <128 x float> %tmp290, i32 112 + %tmp404 = extractelement <128 x float> %tmp290, i32 113 + %tmp405 = extractelement <128 x float> %tmp290, i32 114 + %tmp406 = extractelement <128 x float> %tmp290, i32 115 + %tmp407 = extractelement <128 x float> %tmp290, i32 116 + %tmp408 = extractelement <128 x float> %tmp290, i32 117 + %tmp409 = extractelement <128 x float> %tmp290, i32 118 + %tmp410 = extractelement <128 x float> %tmp290, i32 119 + %tmp411 = extractelement <128 x float> %tmp290, i32 120 + %tmp412 = extractelement <128 x float> %tmp290, i32 121 + %tmp413 = extractelement <128 x float> %tmp290, i32 122 + %tmp414 = extractelement <128 x float> %tmp290, i32 123 + %tmp415 = extractelement <128 x float> %tmp290, i32 124 + %tmp416 = extractelement <128 x float> %tmp290, i32 125 + %tmp417 = extractelement <128 x float> %tmp290, i32 126 + %tmp418 = extractelement <128 x float> %tmp290, i32 127 + %tmp419 = bitcast float %tmp107 to i32 + %tmp420 = add i32 %tmp419, 1 + %tmp421 = bitcast i32 %tmp420 to float + br label %bb24 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" } +attributes #1 = { nounwind readnone } + +!0 = !{!1, !1, i64 0, i32 1} +!1 = !{!"const", null} diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll index 9b2f229c05af1..2bfe1b2bd6ec2 100644 --- a/test/CodeGen/AMDGPU/vop-shrink.ll +++ b/test/CodeGen/AMDGPU/vop-shrink.ll @@ -3,8 +3,8 @@ ; Test that we correctly commute a sub instruction ; FUNC-LABEL: {{^}}sub_rev: -; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s -; SI: v_subrev_i32_e32 v{{[0-9]+}}, s +; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, vcc, s +; SI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, s ; ModuleID = 
'vop-shrink.ll' diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll index 5cc7577cad332..107e84b33be9a 100644 --- a/test/CodeGen/AMDGPU/wait.ll +++ b/test/CodeGen/AMDGPU/wait.ll @@ -1,11 +1,16 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_load_dwordx4 -; CHECK: s_load_dwordx4 -; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; CHECK: s_endpgm +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT +; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX +; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX +; The ilpmax scheduler is used for the second test to get the ordering we want for the test. 
+ +; DEFAULT-LABEL: {{^}}main: +; DEFAULT: s_load_dwordx4 +; DEFAULT: s_load_dwordx4 +; DEFAULT: s_waitcnt vmcnt(0) +; DEFAULT: exp +; DEFAULT: s_waitcnt lgkmcnt(0) +; DEFAULT: s_endpgm define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 @@ -29,7 +34,43 @@ main_body: ret void } -; Function Attrs: noduplicate nounwind +; ILPMAX-LABEL: {{^}}main2: +; ILPMAX: s_load_dwordx4 +; ILPMAX: s_waitcnt lgkmcnt(0) +; ILPMAX: buffer_load +; ILPMAX: s_load_dwordx4 +; ILPMAX: s_waitcnt lgkmcnt(0) +; ILPMAX: buffer_load +; ILPMAX: s_waitcnt vmcnt(1) +; ILPMAX: s_waitcnt vmcnt(0) +; ILPMAX: s_endpgm + +define void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* +byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 { +main_body: + %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0 + %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0 + %13 = add i32 %5, %7 + %14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13) + %15 = extractelement <4 x float> %14, i32 0 + %16 = extractelement <4 x float> %14, i32 1 + %17 = extractelement <4 x float> %14, i32 2 + %18 = extractelement <4 x float> %14, i32 3 + %19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1 + %20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0 + %21 = add i32 %5, %7 + %22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21) + %23 = extractelement <4 x float> %22, i32 0 + %24 = extractelement <4 x float> %22, i32 1 + %25 
= extractelement <4 x float> %22, i32 2 + %26 = extractelement <4 x float> %22, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26) + ret void +} + + +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.global() #1 ; Function Attrs: nounwind readnone @@ -38,7 +79,7 @@ declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { "ShaderType"="1" } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index 4328e964c1bf8..e7fcd1ff36501 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -1,15 +1,34 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN 
-check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}ngroups_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].X +; EG: MOV {{\*? *}}[[VAL]], KC0[0].X + +; HSA: .amd_kernel_code_t + +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; HSA: .end_amd_kernel_code_t + + +; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 @@ -19,12 +38,12 @@ entry: ; FUNC-LABEL: {{^}}ngroups_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Y +; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 @@ -34,12 +53,12 @@ entry: ; FUNC-LABEL: {{^}}ngroups_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Z +; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 @@ -49,12 +68,12 @@ entry: ; FUNC-LABEL: {{^}}global_size_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].W +; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].W -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 @@ -64,12 +83,12 @@ entry: ; FUNC-LABEL: {{^}}global_size_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].X +; EG: MOV {{\*? *}}[[VAL]], KC0[1].X -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 @@ -79,12 +98,12 @@ entry: ; FUNC-LABEL: {{^}}global_size_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Y +; EG: MOV {{\*? 
*}}[[VAL]], KC0[1].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 @@ -92,74 +111,34 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: 
{{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. +; The tgid values are stored in sgprs offset by the number of user +; sgprs. ; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { +; HSA: .amd_kernel_code_t +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword [[VVAL]] + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -167,9 +146,26 @@ entry: } ; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 -; GCN: 
buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 1 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 +; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword [[VVAL]] + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -177,36 +173,81 @@ entry: } ; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 1 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword 
[[VVAL]] + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: -; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; GCN-NOHSA: buffer_store_dword v0 +; HSA: flat_store_dword v0 +define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: -; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { + +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 +; GCN-NOHSA: buffer_store_dword v1 +; HSA: flat_store_dword v1 +define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: -; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 +; GCN-NOHSA: buffer_store_dword v2 +; HSA: flat_store_dword v2 +define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -221,10 +262,6 @@ declare i32 @llvm.r600.read.global.size.x() #0 declare i32 @llvm.r600.read.global.size.y() #0 declare i32 @llvm.r600.read.global.size.z() #0 -declare i32 @llvm.r600.read.local.size.x() #0 -declare 
i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0 diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index ddb920af29d84..655655d92f08f 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -38,7 +38,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in } ; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}} ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll index 033055db185a4..35ddf2b0a465b 100644 --- a/test/CodeGen/AMDGPU/zero_extend.ll +++ b/test/CodeGen/AMDGPU/zero_extend.ll @@ -7,8 +7,7 @@ ; R600: MEM_RAT_CACHELESS STORE_RAW ; SI: {{^}}test: -; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} -; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] +; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: diff --git a/test/CodeGen/ARM/2007-03-13-InstrSched.ll b/test/CodeGen/ARM/2007-03-13-InstrSched.ll index 9c0143be06c37..81a6bb64971d6 100644 --- a/test/CodeGen/ARM/2007-03-13-InstrSched.ll +++ b/test/CodeGen/ARM/2007-03-13-InstrSched.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \ ; RUN: -mattr=+v6 | grep r9 ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \ -; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats 2>&1 | grep asm-printer +; RUN: -mattr=+v6,+reserve-r9 -ifcvt-limit=0 -stats 2>&1 | grep asm-printer ; | grep 35 define void @test(i32 %tmp56222, i32 %tmp36224, i32 %tmp46223, i32 %i.0196.0.ph, i32 %tmp8, 
i32* %tmp1011, i32** %tmp1, i32* %d2.1.out, i32* %d3.1.out, i32* %d0.1.out, i32* %d1.1.out) { diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll index 3f47488372b8c..613694f091d1e 100644 --- a/test/CodeGen/ARM/2009-10-16-Scope.ll +++ b/test/CodeGen/ARM/2009-10-16-Scope.ll @@ -24,9 +24,9 @@ declare i32 @foo(i32) ssp !0 = !DILocation(line: 5, column: 2, scope: !1) !1 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !2) -!2 = !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !3) -!3 = !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: 0, file: !8, retainedTypes: !9) -!4 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "count_", line: 5, scope: !5, file: !3, type: !6) +!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !3) +!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: 0, file: !8, retainedTypes: !9) +!4 = !DILocalVariable(name: "count_", line: 5, scope: !5, file: !3, type: !6) !5 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !1) !6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !7 = !DILocation(line: 6, column: 1, scope: !2) diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll index 638b26c73146d..1341830b4a4b1 100644 --- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll +++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll @@ -14,11 +14,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!15} -!0 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "b", line: 93, arg: 0, scope: !1, file: !2, type: !6) -!1 = 
!DISubprogram(name: "__addvsi3", linkageName: "__addvsi3", line: 94, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !12, scope: null, type: !4) +!0 = !DILocalVariable(name: "b", line: 93, arg: 2, scope: !1, file: !2, type: !6) +!1 = distinct !DISubprogram(name: "__addvsi3", linkageName: "__addvsi3", line: 94, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !12, scope: null, type: !4) !2 = !DIFile(filename: "libgcc2.c", directory: "/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc") !12 = !DIFile(filename: "libgcc2.c", directory: "/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc") -!3 = !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: 0, file: !12, enums: !13, retainedTypes: !13, subprograms: !14) +!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: 0, file: !12, enums: !13, retainedTypes: !13, subprograms: !14) !4 = !DISubroutineType(types: !5) !5 = !{!6, !6, !6} !6 = !DIDerivedType(tag: DW_TAG_typedef, name: "SItype", line: 152, file: !12, baseType: !8) diff --git a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll index cfaffd8234ba2..171b6d2bcc5c9 100644 --- a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll +++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll @@ -1,36 +1,36 @@ ; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null ; This test would crash the rewriter when trying to handle a spill after one of -; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register. +; the @llvm.arm.neon.vld3.v8i8.p0i8 defined three parts of a register. 
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } -declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly +declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind { - %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1] %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1] - %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1] %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1] - %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] + %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1] - %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] + %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp2f = extractvalue %struct.__neon_int8x8x3_t 
%tmp1f, 0 ; <<8 x i8>> [#uses=1] - %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1] %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1] - %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1] %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1] %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1] %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1] %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1] %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2] - call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1) + call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1) %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1] %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1] %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1] @@ -38,8 +38,8 @@ define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1] %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1] %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2] - call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1) + call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 
x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1) %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1] - tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1) + tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1) ret <8 x i8> %tmp4 } diff --git a/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/test/CodeGen/ARM/2010-05-21-BuildVector.ll index 6a6ccf3d0a014..c6c0e2caee420 100644 --- a/test/CodeGen/ARM/2010-05-21-BuildVector.ll +++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll @@ -36,8 +36,8 @@ entry: %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3 %19 = fmul <4 x float> %tmp5, %2 %20 = bitcast float* %fltp to i8* - tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1) + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %20, <4 x float> %19, i32 1) ret void } -declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind diff --git a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll index f86c3ba9ef6e3..1deb98631a4f6 100644 --- a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll +++ b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll @@ -12,8 +12,8 @@ entry: %tmp9 = trunc i128 %tmp8 to i64 ; [#uses=1] %tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1] %tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1] - tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind + tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind ret void } -declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind diff --git a/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll 
b/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll index bead8d9781e84..47a5ef0bc5444 100755 --- a/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll +++ b/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll @@ -20,7 +20,7 @@ @.str51 = external constant [45 x i8] ; <[45 x i8]*> [#uses=1] @__PRETTY_FUNCTION__._ZNK4llvm7VarInit12getFieldInitERNS_6RecordEPKNS_9RecordValERKSs = external constant [116 x i8] ; <[116 x i8]*> [#uses=1] -@_ZN4llvm9RecordValC1ERKSsPNS_5RecTyEj = alias void (%"class.llvm::RecordVal"*, %"class.std::basic_string"*, %"struct.llvm::Init"*, i32)* @_ZN4llvm9RecordValC2ERKSsPNS_5RecTyEj ; [#uses=0] +@_ZN4llvm9RecordValC1ERKSsPNS_5RecTyEj = alias void (%"class.llvm::RecordVal"*, %"class.std::basic_string"*, %"struct.llvm::Init"*, i32), void (%"class.llvm::RecordVal"*, %"class.std::basic_string"*, %"struct.llvm::Init"*, i32)* @_ZN4llvm9RecordValC2ERKSsPNS_5RecTyEj ; [#uses=0] declare i8* @__dynamic_cast(i8*, i8*, i8*, i32) diff --git a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll index 95bb2769759e6..38b352c473b1c 100644 --- a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll +++ b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll @@ -47,19 +47,19 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !llvm.dbg.lv.fn = !{!0, !8, !10, !12} !llvm.dbg.gv = !{!14} -!0 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "buf", line: 4, arg: 0, scope: !1, file: !2, type: !6) -!1 = !DISubprogram(name: "x0", linkageName: "x0", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !26, scope: null, type: !4) +!0 = !DILocalVariable(name: "buf", line: 4, arg: 1, scope: !1, file: !2, type: !6) +!1 = distinct !DISubprogram(name: "x0", linkageName: "x0", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !26, scope: null, type: !4) !2 = !DIFile(filename: "t.c", directory: "/private/tmp") -!3 = 
!DICompileUnit(language: DW_LANG_C99, producer: "clang 2.0", isOptimized: true, file: !26) +!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 2.0", isOptimized: true, file: !26) !4 = !DISubroutineType(types: !5) !5 = !{null} !6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !26, scope: !2, baseType: !7) !7 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) -!8 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "nbytes", line: 4, arg: 0, scope: !1, file: !2, type: !9) +!8 = !DILocalVariable(name: "nbytes", line: 4, arg: 2, scope: !1, file: !2, type: !9) !9 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned long", size: 32, align: 32, encoding: DW_ATE_unsigned) -!10 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "nread", line: 6, scope: !11, file: !2, type: !9) +!10 = !DILocalVariable(name: "nread", line: 6, scope: !11, file: !2, type: !9) !11 = distinct !DILexicalBlock(line: 5, column: 1, file: !26, scope: !1) -!12 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "c", line: 7, scope: !11, file: !2, type: !13) +!12 = !DILocalVariable(name: "c", line: 7, scope: !11, file: !2, type: !13) !13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !14 = !DIGlobalVariable(name: "length", linkageName: "length", line: 1, isLocal: false, isDefinition: true, scope: !2, file: !2, type: !13, variable: i32* @length) !15 = !DILocation(line: 4, column: 24, scope: !1) diff --git a/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll b/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll index 1aee5088eee4e..130221d38c235 100644 --- a/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll +++ b/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll @@ -16,10 +16,10 @@ target triple = "thumbv7-apple-darwin10" define i32 @test(i8* %arg) nounwind { entry: - %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1) + %0 = call <2 x 
i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %arg, i32 1) %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> store <2 x i64> %1, <2 x i64>* undef, align 16 ret i32 undef } -declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly +declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll index 953e2bbf291c3..14ddb59b53870 100644 --- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll +++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll @@ -3,7 +3,7 @@ %struct.SVal = type { i8*, i32 } -define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp { +define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp !dbg !17 { entry: %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !23, metadata !DIExpression()), !dbg !24 @@ -31,7 +31,7 @@ return: ; preds = %bb2 ret i32 %.0, !dbg !29 } -define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 { +define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 !dbg !16 { entry: %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] call void @llvm.dbg.value(metadata %struct.SVal* %this, i64 0, metadata !31, metadata !DIExpression()), !dbg !34 @@ -47,7 +47,7 @@ return: ; preds = %entry declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone -define i32 @main() nounwind ssp { +define i32 @main() nounwind ssp !dbg !20 { entry: %0 = alloca %struct.SVal ; <%struct.SVal*> [#uses=3] %v = alloca %struct.SVal ; <%struct.SVal*> [#uses=4] @@ -80,7 +80,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !0 = !DISubprogram(name: "SVal", line: 11, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14) !1 = !DICompositeType(tag: DW_TAG_structure_type, name: "SVal", 
line: 1, size: 128, align: 64, file: !48, elements: !4) !2 = !DIFile(filename: "small.cc", directory: "/Users/manav/R8248330") -!3 = !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 1, file: !48, enums: !47, retainedTypes: !47, subprograms: !46, globals: !47, imports: !47) +!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 1, file: !48, enums: !47, retainedTypes: !47, subprograms: !46, globals: !47, imports: !47) !4 = !{!5, !7, !0, !9} !5 = !DIDerivedType(tag: DW_TAG_member, name: "Data", line: 7, size: 64, align: 64, file: !48, scope: !1, baseType: !6) !6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !48, baseType: null) @@ -93,35 +93,35 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !14 = !DISubroutineType(types: !15) !15 = !{null, !12} -!16 = !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14, function: void (%struct.SVal*)* @_ZN4SValC1Ev) -!17 = !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !18, function: i32 (i32, %struct.SVal*)* @_Z3fooi4SVal) +!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14) +!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !18) !18 = !DISubroutineType(types: !19) !19 = !{!13, 
!13, !1} -!20 = !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !21, function: i32 ()* @main) +!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !21) !21 = !DISubroutineType(types: !22) !22 = !{!13} -!23 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "i", line: 16, arg: 0, scope: !17, file: !2, type: !13) +!23 = !DILocalVariable(name: "i", line: 16, arg: 1, scope: !17, file: !2, type: !13) !24 = !DILocation(line: 16, scope: !17) -!25 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "location", line: 16, arg: 0, scope: !17, file: !2, type: !26) +!25 = !DILocalVariable(name: "location", line: 16, arg: 2, scope: !17, file: !2, type: !26) !26 = !DIDerivedType(tag: DW_TAG_reference_type, name: "SVal", size: 64, align: 64, file: !48, scope: !2, baseType: !1) !27 = !DILocation(line: 17, scope: !28) !28 = distinct !DILexicalBlock(line: 16, column: 0, file: !2, scope: !17) !29 = !DILocation(line: 18, scope: !28) !30 = !DILocation(line: 20, scope: !28) -!31 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "this", line: 11, arg: 0, scope: !16, file: !2, type: !32) +!31 = !DILocalVariable(name: "this", line: 11, arg: 1, scope: !16, file: !2, type: !32) !32 = !DIDerivedType(tag: DW_TAG_const_type, size: 64, align: 64, flags: DIFlagArtificial, file: !48, scope: !2, baseType: !33) !33 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !48, scope: !2, baseType: !1) !34 = !DILocation(line: 11, scope: !16) !35 = !DILocation(line: 11, scope: !36) !36 = distinct !DILexicalBlock(line: 11, column: 0, file: !48, scope: !37) !37 = distinct !DILexicalBlock(line: 11, column: 0, file: !48, scope: !16) -!38 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "v", line: 24, scope: !39, file: !2, type: !1) +!38 = !DILocalVariable(name: 
"v", line: 24, scope: !39, file: !2, type: !1) !39 = distinct !DILexicalBlock(line: 23, column: 0, file: !48, scope: !40) !40 = distinct !DILexicalBlock(line: 23, column: 0, file: !48, scope: !20) !41 = !DILocation(line: 24, scope: !39) !42 = !DILocation(line: 25, scope: !39) !43 = !DILocation(line: 26, scope: !39) -!44 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "k", line: 26, scope: !39, file: !2, type: !13) +!44 = !DILocalVariable(name: "k", line: 26, scope: !39, file: !2, type: !13) !45 = !DILocation(line: 27, scope: !39) !46 = !{!16, !17, !20} !47 = !{} diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll index 9a5baf21b8fbd..d5eed8b6a2c47 100644 --- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll +++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll @@ -28,7 +28,7 @@ target triple = "thumbv7-apple-darwin10" ; CHECK-NOT: {{DW_TAG|NULL}} ; CHECK: DW_AT_location [DW_FORM_exprloc] (<0x8> 03 [[ADDR]] 10 01 22 ) -define zeroext i8 @get1(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get1(i8 zeroext %a) nounwind optsize !dbg !0 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !10, metadata !DIExpression()), !dbg !30 %0 = load i8, i8* @x1, align 4, !dbg !30 @@ -39,7 +39,7 @@ entry: declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone -define zeroext i8 @get2(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get2(i8 zeroext %a) nounwind optsize !dbg !6 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !18, metadata !DIExpression()), !dbg !32 %0 = load i8, i8* @x2, align 4, !dbg !32 @@ -48,7 +48,7 @@ entry: ret i8 %0, !dbg !33 } -define zeroext i8 @get3(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get3(i8 zeroext %a) nounwind optsize !dbg !7 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !21, metadata !DIExpression()), !dbg !34 %0 = load i8, i8* @x3, align 4, !dbg !34 @@ -57,7 +57,7 @@ 
entry: ret i8 %0, !dbg !35 } -define zeroext i8 @get4(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get4(i8 zeroext %a) nounwind optsize !dbg !8 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !24, metadata !DIExpression()), !dbg !36 %0 = load i8, i8* @x4, align 4, !dbg !36 @@ -66,7 +66,7 @@ entry: ret i8 %0, !dbg !37 } -define zeroext i8 @get5(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get5(i8 zeroext %a) nounwind optsize !dbg !9 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !27, metadata !DIExpression()), !dbg !38 %0 = load i8, i8* @x5, align 4, !dbg !38 @@ -78,35 +78,35 @@ entry: !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!49} -!0 = !DISubprogram(name: "get1", linkageName: "get1", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 4, file: !47, scope: !1, type: !3, function: i8 (i8)* @get1, variables: !42) +!0 = distinct !DISubprogram(name: "get1", linkageName: "get1", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 4, file: !47, scope: !1, type: !3, variables: !42) !1 = !DIFile(filename: "foo.c", directory: "/tmp/") -!2 = !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", isOptimized: true, emissionKind: 0, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) +!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 2369.8)", isOptimized: true, emissionKind: 0, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) !3 = !DISubroutineType(types: !4) !4 = !{!5, !5} !5 = !DIBasicType(tag: DW_TAG_base_type, name: "_Bool", size: 8, align: 8, encoding: DW_ATE_boolean) -!6 = !DISubprogram(name: "get2", linkageName: "get2", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !47, scope: !1, type: !3, function: i8 (i8)* @get2, variables: !43) -!7 = !DISubprogram(name: "get3", linkageName: "get3", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !47, scope: !1, type: !3, function: i8 (i8)* @get3, variables: !44) -!8 = !DISubprogram(name: "get4", linkageName: "get4", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 13, file: !47, scope: !1, type: !3, function: i8 (i8)* @get4, variables: !45) -!9 = !DISubprogram(name: "get5", linkageName: "get5", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !47, scope: !1, type: !3, function: i8 (i8)* @get5, variables: !46) -!10 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 4, arg: 0, scope: !0, file: !1, type: !5) -!11 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 4, scope: !12, file: !1, type: !5) +!6 = distinct !DISubprogram(name: "get2", linkageName: "get2", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !47, scope: !1, type: !3, variables: !43) +!7 = distinct !DISubprogram(name: "get3", linkageName: "get3", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !47, scope: !1, type: !3, variables: !44) +!8 = 
distinct !DISubprogram(name: "get4", linkageName: "get4", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 13, file: !47, scope: !1, type: !3, variables: !45) +!9 = distinct !DISubprogram(name: "get5", linkageName: "get5", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !47, scope: !1, type: !3, variables: !46) +!10 = !DILocalVariable(name: "a", line: 4, arg: 1, scope: !0, file: !1, type: !5) +!11 = !DILocalVariable(name: "b", line: 4, scope: !12, file: !1, type: !5) !12 = distinct !DILexicalBlock(line: 4, column: 0, file: !47, scope: !0) !13 = !DIGlobalVariable(name: "x1", line: 3, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x1) !14 = !DIGlobalVariable(name: "x2", line: 6, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x2) !15 = !DIGlobalVariable(name: "x3", line: 9, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x3) !16 = !DIGlobalVariable(name: "x4", line: 12, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x4) !17 = !DIGlobalVariable(name: "x5", line: 15, isLocal: false, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x5) -!18 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 7, arg: 0, scope: !6, file: !1, type: !5) -!19 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 7, scope: !20, file: !1, type: !5) +!18 = !DILocalVariable(name: "a", line: 7, arg: 1, scope: !6, file: !1, type: !5) +!19 = !DILocalVariable(name: "b", line: 7, scope: !20, file: !1, type: !5) !20 = distinct !DILexicalBlock(line: 7, column: 0, file: !47, scope: !6) -!21 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 10, arg: 0, scope: !7, file: !1, type: !5) -!22 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 10, scope: !23, file: !1, 
type: !5) +!21 = !DILocalVariable(name: "a", line: 10, arg: 1, scope: !7, file: !1, type: !5) +!22 = !DILocalVariable(name: "b", line: 10, scope: !23, file: !1, type: !5) !23 = distinct !DILexicalBlock(line: 10, column: 0, file: !47, scope: !7) -!24 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 13, arg: 0, scope: !8, file: !1, type: !5) -!25 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 13, scope: !26, file: !1, type: !5) +!24 = !DILocalVariable(name: "a", line: 13, arg: 1, scope: !8, file: !1, type: !5) +!25 = !DILocalVariable(name: "b", line: 13, scope: !26, file: !1, type: !5) !26 = distinct !DILexicalBlock(line: 13, column: 0, file: !47, scope: !8) -!27 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 16, arg: 0, scope: !9, file: !1, type: !5) -!28 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 16, scope: !29, file: !1, type: !5) +!27 = !DILocalVariable(name: "a", line: 16, arg: 1, scope: !9, file: !1, type: !5) +!28 = !DILocalVariable(name: "b", line: 16, scope: !29, file: !1, type: !5) !29 = distinct !DILexicalBlock(line: 16, column: 0, file: !47, scope: !9) !30 = !DILocation(line: 4, scope: !0) !31 = !DILocation(line: 4, scope: !12) diff --git a/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll b/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll index aac8f7b3a026b..1097050df54b3 100644 --- a/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll +++ b/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -arm-global-merge -global-merge-group-by-use=false | FileCheck %s -; CHECK: .zerofill __DATA,__bss,__MergedGlobals,16,2 +; CHECK: .zerofill __DATA,__bss,l__MergedGlobals,16,2 @prev = external global [0 x i16] @max_lazy_match = internal unnamed_addr global i32 0, align 4 diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll index 067c719f491c6..3d82e706862c1 100644 --- 
a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll +++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll @@ -28,7 +28,7 @@ target triple = "thumbv7-apple-macosx10.7.0" @x4 = internal unnamed_addr global i32 4, align 4 @x5 = global i32 0, align 4 -define i32 @get1(i32 %a) nounwind optsize ssp { +define i32 @get1(i32 %a) nounwind optsize ssp !dbg !1 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !10, metadata !DIExpression()), !dbg !30 %1 = load i32, i32* @x1, align 4, !dbg !31 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !11, metadata !DIExpression()), !dbg !31 @@ -36,7 +36,7 @@ define i32 @get1(i32 %a) nounwind optsize ssp { ret i32 %1, !dbg !31 } -define i32 @get2(i32 %a) nounwind optsize ssp { +define i32 @get2(i32 %a) nounwind optsize ssp !dbg !6 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !13, metadata !DIExpression()), !dbg !32 %1 = load i32, i32* @x2, align 4, !dbg !33 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !14, metadata !DIExpression()), !dbg !33 @@ -44,7 +44,7 @@ define i32 @get2(i32 %a) nounwind optsize ssp { ret i32 %1, !dbg !33 } -define i32 @get3(i32 %a) nounwind optsize ssp { +define i32 @get3(i32 %a) nounwind optsize ssp !dbg !7 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !16, metadata !DIExpression()), !dbg !34 %1 = load i32, i32* @x3, align 4, !dbg !35 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !17, metadata !DIExpression()), !dbg !35 @@ -52,7 +52,7 @@ define i32 @get3(i32 %a) nounwind optsize ssp { ret i32 %1, !dbg !35 } -define i32 @get4(i32 %a) nounwind optsize ssp { +define i32 @get4(i32 %a) nounwind optsize ssp !dbg !8 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !19, metadata !DIExpression()), !dbg !36 %1 = load i32, i32* @x4, align 4, !dbg !37 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !20, metadata !DIExpression()), !dbg !37 @@ -60,7 +60,7 @@ define i32 @get4(i32 %a) nounwind 
optsize ssp { ret i32 %1, !dbg !37 } -define i32 @get5(i32 %a) nounwind optsize ssp { +define i32 @get5(i32 %a) nounwind optsize ssp !dbg !9 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !27, metadata !DIExpression()), !dbg !38 %1 = load i32, i32* @x5, align 4, !dbg !39 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !28, metadata !DIExpression()), !dbg !39 @@ -73,32 +73,32 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!49} -!0 = !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: true, emissionKind: 1, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) -!1 = !DISubprogram(name: "get1", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !47, scope: !2, type: !3, function: i32 (i32)* @get1, variables: !42) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: true, emissionKind: 1, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) +!1 = distinct !DISubprogram(name: "get1", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !47, scope: !2, type: !3, variables: !42) !2 = !DIFile(filename: "ss3.c", directory: "/private/tmp") !3 = !DISubroutineType(types: !4) !4 = !{!5} !5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) -!6 = !DISubprogram(name: "get2", line: 8, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 8, file: !47, scope: !2, type: !3, function: i32 (i32)* @get2, variables: !43) -!7 = !DISubprogram(name: "get3", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !47, scope: !2, type: !3, 
function: i32 (i32)* @get3, variables: !44) -!8 = !DISubprogram(name: "get4", line: 14, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 14, file: !47, scope: !2, type: !3, function: i32 (i32)* @get4, variables: !45) -!9 = !DISubprogram(name: "get5", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 17, file: !47, scope: !2, type: !3, function: i32 (i32)* @get5, variables: !46) -!10 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 5, arg: 1, scope: !1, file: !2, type: !5) -!11 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 5, scope: !12, file: !2, type: !5) +!6 = distinct !DISubprogram(name: "get2", line: 8, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 8, file: !47, scope: !2, type: !3, variables: !43) +!7 = distinct !DISubprogram(name: "get3", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !47, scope: !2, type: !3, variables: !44) +!8 = distinct !DISubprogram(name: "get4", line: 14, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 14, file: !47, scope: !2, type: !3, variables: !45) +!9 = distinct !DISubprogram(name: "get5", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 17, file: !47, scope: !2, type: !3, variables: !46) +!10 = !DILocalVariable(name: "a", line: 5, arg: 1, scope: !1, file: !2, type: !5) +!11 = !DILocalVariable(name: "b", line: 5, scope: !12, file: !2, type: !5) !12 = distinct !DILexicalBlock(line: 5, column: 19, file: !47, scope: !1) -!13 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 8, arg: 1, scope: !6, file: !2, type: !5) -!14 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 8, scope: !15, 
file: !2, type: !5) +!13 = !DILocalVariable(name: "a", line: 8, arg: 1, scope: !6, file: !2, type: !5) +!14 = !DILocalVariable(name: "b", line: 8, scope: !15, file: !2, type: !5) !15 = distinct !DILexicalBlock(line: 8, column: 17, file: !47, scope: !6) -!16 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 11, arg: 1, scope: !7, file: !2, type: !5) -!17 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 11, scope: !18, file: !2, type: !5) +!16 = !DILocalVariable(name: "a", line: 11, arg: 1, scope: !7, file: !2, type: !5) +!17 = !DILocalVariable(name: "b", line: 11, scope: !18, file: !2, type: !5) !18 = distinct !DILexicalBlock(line: 11, column: 19, file: !47, scope: !7) -!19 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 14, arg: 1, scope: !8, file: !2, type: !5) -!20 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 14, scope: !21, file: !2, type: !5) +!19 = !DILocalVariable(name: "a", line: 14, arg: 1, scope: !8, file: !2, type: !5) +!20 = !DILocalVariable(name: "b", line: 14, scope: !21, file: !2, type: !5) !21 = distinct !DILexicalBlock(line: 14, column: 19, file: !47, scope: !8) !25 = !DIGlobalVariable(name: "x1", line: 4, isLocal: true, isDefinition: true, scope: !0, file: !2, type: !5, variable: i32* @x1) !26 = !DIGlobalVariable(name: "x2", line: 7, isLocal: true, isDefinition: true, scope: !0, file: !2, type: !5, variable: i32* @x2) -!27 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 17, arg: 1, scope: !9, file: !2, type: !5) -!28 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 17, scope: !29, file: !2, type: !5) +!27 = !DILocalVariable(name: "a", line: 17, arg: 1, scope: !9, file: !2, type: !5) +!28 = !DILocalVariable(name: "b", line: 17, scope: !29, file: !2, type: !5) !29 = distinct !DILexicalBlock(line: 17, column: 19, file: !47, scope: !9) !30 = !DILocation(line: 5, column: 16, scope: !1) !31 = !DILocation(line: 5, column: 32, scope: !12) diff --git 
a/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll b/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll index 3cbc4cdcd707a..d702af7c0c708 100644 --- a/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll +++ b/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll @@ -4,9 +4,9 @@ define void @test_vmovqqqq_pseudo() nounwind ssp { entry: - %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2) + %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2) store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef ret void } -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly diff --git a/test/CodeGen/ARM/2011-10-26-memset-inline.ll b/test/CodeGen/ARM/2011-10-26-memset-inline.ll index 17bd291a6b55e..5df439389cdb0 100644 --- a/test/CodeGen/ARM/2011-10-26-memset-inline.ll +++ b/test/CodeGen/ARM/2011-10-26-memset-inline.ll @@ -1,5 +1,5 @@ ; Make sure short memsets on ARM lower to stores, even when optimizing for size. 
-; RUN: llc -march=arm < %s | FileCheck %s -check-prefix=CHECK-GENERIC +; RUN: llc -march=arm -mattr=+strict-align < %s | FileCheck %s -check-prefix=CHECK-GENERIC ; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s -check-prefix=CHECK-UNALIGNED target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" diff --git a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll index b70b7f6f3b2ea..f622ceb584e6e 100644 --- a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll +++ b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll @@ -52,8 +52,8 @@ cond.end295: ; preds = %entry %shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer %shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32> %2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float> - tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind - tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind unreachable for.end: ; preds = %entry @@ -63,10 +63,10 @@ for.end: ; preds = %entry ; Check that pseudo-expansion preserves flags. 
define void @foo3(i8* %p) nounwind ssp { entry: - tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4) + tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4) ret void } declare arm_aapcs_vfpcc void @bar(i8*, float, float, float) -declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind -declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind diff --git a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll index 7f30ae10e436d..606af47a3d8ee 100644 --- a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll +++ b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll @@ -7,8 +7,8 @@ entry: %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1 %0 = bitcast i32* %p to i8* - tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4) + tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4) ret void } -declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind diff --git a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll index 545bfc73c5905..6cff67614c640 100644 --- a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll +++ b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll @@ -5,9 +5,9 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = "thumbv7-apple-ios5.1.0" -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly +declare { <16 x i8>, <16 x i8>, <16 x 
i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind define void @findEdges(i8*) nounwind ssp { %2 = icmp sgt i32 undef, 0 @@ -19,16 +19,16 @@ define void @findEdges(i8*) nounwind ssp { ;