From dd58ef019b700900793a1eb48b52123db01b654e Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 30 Dec 2015 11:46:15 +0000 Subject: Vendor import of llvm trunk r256633: https://llvm.org/svn/llvm-project/llvm/trunk@256633 --- .../aarch64-2014-08-11-MachineCombinerCrash.ll | 46 +- test/CodeGen/AArch64/aarch64-addv.ll | 98 + test/CodeGen/AArch64/aarch64-deferred-spilling.ll | 514 + .../AArch64/aarch64-dynamic-stack-layout.ll | 24 +- .../AArch64/aarch64-interleaved-accesses.ll | 147 +- test/CodeGen/AArch64/aarch64-loop-gep-opt.ll | 50 + test/CodeGen/AArch64/aarch64-minmaxv.ll | 511 + test/CodeGen/AArch64/aarch64-smax-constantfold.ll | 12 + test/CodeGen/AArch64/addsub_ext.ll | 146 + test/CodeGen/AArch64/alloca.ll | 4 +- .../AArch64/arm64-2011-03-17-AsmPrinterCrash.ll | 14 +- test/CodeGen/AArch64/arm64-aapcs-be.ll | 2 +- test/CodeGen/AArch64/arm64-aapcs.ll | 21 +- test/CodeGen/AArch64/arm64-abi_align.ll | 2 +- test/CodeGen/AArch64/arm64-addr-type-promotion.ll | 9 +- .../AArch64/arm64-alloca-frame-pointer-offset.ll | 6 +- test/CodeGen/AArch64/arm64-arith.ll | 3 +- test/CodeGen/AArch64/arm64-atomic-128.ll | 7 +- test/CodeGen/AArch64/arm64-atomic.ll | 70 +- test/CodeGen/AArch64/arm64-builtins-linux.ll | 11 + test/CodeGen/AArch64/arm64-ccmp-heuristics.ll | 4 +- test/CodeGen/AArch64/arm64-ccmp.ll | 166 +- test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll | 17 + test/CodeGen/AArch64/arm64-collect-loh.ll | 604 + test/CodeGen/AArch64/arm64-fast-isel-br.ll | 15 +- test/CodeGen/AArch64/arm64-fmax-safe.ll | 53 + test/CodeGen/AArch64/arm64-fmax.ll | 46 +- test/CodeGen/AArch64/arm64-fp128.ll | 31 +- test/CodeGen/AArch64/arm64-hello.ll | 4 +- test/CodeGen/AArch64/arm64-indexed-memory.ll | 33 + test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll | 26 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 2 +- test/CodeGen/AArch64/arm64-join-reserved.ll | 2 +- test/CodeGen/AArch64/arm64-large-frame.ll | 2 +- test/CodeGen/AArch64/arm64-ld-from-st.ll | 666 + test/CodeGen/AArch64/arm64-ldp.ll 
| 188 +- test/CodeGen/AArch64/arm64-long-shift.ll | 80 +- .../AArch64/arm64-misaligned-memcpy-inline.ll | 2 +- test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll | 406 + test/CodeGen/AArch64/arm64-neon-2velem.ll | 55 + test/CodeGen/AArch64/arm64-neon-copy.ll | 17 +- .../AArch64/arm64-patchpoint-webkit_jscc.ll | 8 +- test/CodeGen/AArch64/arm64-platform-reg.ll | 4 +- test/CodeGen/AArch64/arm64-popcnt.ll | 8 +- test/CodeGen/AArch64/arm64-rounding.ll | 62 +- test/CodeGen/AArch64/arm64-shrink-wrapping.ll | 95 +- test/CodeGen/AArch64/arm64-spill-lr.ll | 6 +- test/CodeGen/AArch64/arm64-stackmap.ll | 4 +- test/CodeGen/AArch64/arm64-stp.ll | 34 +- test/CodeGen/AArch64/arm64-strict-align.ll | 5 +- test/CodeGen/AArch64/arm64-tls-dynamic-together.ll | 43 +- test/CodeGen/AArch64/arm64-trunc-store.ll | 2 +- test/CodeGen/AArch64/arm64-vabs.ll | 66 + test/CodeGen/AArch64/arm64-variadic-aapcs.ll | 2 +- test/CodeGen/AArch64/arm64-vector-ext.ll | 54 +- test/CodeGen/AArch64/arm64-vminmaxnm.ll | 17 +- test/CodeGen/AArch64/arm64-xaluo.ll | 4 +- test/CodeGen/AArch64/atomic-ops.ll | 20 +- test/CodeGen/AArch64/bitcast-v2i8.ll | 2 +- test/CodeGen/AArch64/bitfield-insert.ll | 41 + test/CodeGen/AArch64/bitfield.ll | 46 +- test/CodeGen/AArch64/bitreverse.ll | 87 + test/CodeGen/AArch64/combine-comparisons-by-cse.ll | 26 + test/CodeGen/AArch64/cpus.ll | 1 + test/CodeGen/AArch64/cxx-tlscc.ll | 76 + test/CodeGen/AArch64/dag-combine-select.ll | 47 + test/CodeGen/AArch64/divrem.ll | 22 + test/CodeGen/AArch64/emutls.ll | 116 + test/CodeGen/AArch64/emutls_generic.ll | 59 + test/CodeGen/AArch64/eon.ll | 29 + test/CodeGen/AArch64/f16-instructions.ll | 111 +- test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll | 19 + .../CodeGen/AArch64/fast-isel-branch-cond-split.ll | 52 +- test/CodeGen/AArch64/fast-isel-cmp-vec.ll | 100 + test/CodeGen/AArch64/fast-isel-folded-shift.ll | 125 + test/CodeGen/AArch64/fast-isel-logic-op.ll | 2 +- test/CodeGen/AArch64/fastcc-reserved.ll | 4 +- test/CodeGen/AArch64/fastcc.ll | 8 
+- test/CodeGen/AArch64/fcvt_combine.ll | 154 + test/CodeGen/AArch64/fdiv_combine.ll | 115 + test/CodeGen/AArch64/fold-constants.ll | 19 +- test/CodeGen/AArch64/fp16-v4-instructions.ll | 51 +- test/CodeGen/AArch64/fp16-v8-instructions.ll | 63 + test/CodeGen/AArch64/free-zext.ll | 59 +- test/CodeGen/AArch64/func-argpassing.ll | 4 +- test/CodeGen/AArch64/func-calls.ll | 4 +- test/CodeGen/AArch64/global-alignment.ll | 2 +- test/CodeGen/AArch64/global-merge-1.ll | 16 +- test/CodeGen/AArch64/global-merge-2.ll | 34 +- test/CodeGen/AArch64/global-merge-3.ll | 46 +- test/CodeGen/AArch64/global-merge-4.ll | 6 +- test/CodeGen/AArch64/global-merge-group-by-use.ll | 6 +- .../global-merge-ignore-single-use-minsize.ll | 2 +- .../AArch64/global-merge-ignore-single-use.ll | 2 +- test/CodeGen/AArch64/ldst-opt.ll | 477 +- test/CodeGen/AArch64/merge-store.ll | 30 + test/CodeGen/AArch64/misched-fusion.ll | 34 + test/CodeGen/AArch64/mul-lohi.ll | 29 + test/CodeGen/AArch64/nest-register.ll | 2 +- test/CodeGen/AArch64/nontemporal.ll | 339 + test/CodeGen/AArch64/pic-eh-stubs.ll | 2 +- test/CodeGen/AArch64/readcyclecounter.ll | 15 + test/CodeGen/AArch64/regress-tblgen-chains.ll | 4 +- test/CodeGen/AArch64/remat.ll | 1 + test/CodeGen/AArch64/rotate.ll | 14 + test/CodeGen/AArch64/round-conv.ll | 330 + test/CodeGen/AArch64/shrink-wrap.ll | 184 + test/CodeGen/AArch64/stackmap-frame-setup.ll | 20 + test/CodeGen/AArch64/tail-call.ll | 6 +- test/CodeGen/AArch64/tailcall-explicit-sret.ll | 2 +- test/CodeGen/AArch64/tbi.ll | 102 + test/CodeGen/AArch64/vector-fcopysign.ll | 178 + test/CodeGen/AArch64/xbfiz.ll | 30 + test/CodeGen/AMDGPU/add.ll | 14 +- test/CodeGen/AMDGPU/address-space.ll | 6 +- test/CodeGen/AMDGPU/addrspacecast.ll | 66 + test/CodeGen/AMDGPU/and.ll | 101 +- test/CodeGen/AMDGPU/annotate-kernel-features.ll | 193 + test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 8 +- test/CodeGen/AMDGPU/bitreverse.ll | 115 + test/CodeGen/AMDGPU/calling-conventions.ll | 20 + 
test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll | 98 + test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 254 +- test/CodeGen/AMDGPU/ci-use-flat-for-global.ll | 15 + test/CodeGen/AMDGPU/ctpop64.ll | 22 +- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 12 +- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll | 52 + .../ds-negative-offset-addressing-mode-loop.ll | 10 +- test/CodeGen/AMDGPU/ds-sub-offset.ll | 125 + test/CodeGen/AMDGPU/ds_read2.ll | 10 +- test/CodeGen/AMDGPU/ds_read2_superreg.ll | 89 +- test/CodeGen/AMDGPU/ds_read2st64.ll | 8 +- test/CodeGen/AMDGPU/ds_write2.ll | 9 +- test/CodeGen/AMDGPU/ds_write2st64.ll | 4 +- test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 11 + test/CodeGen/AMDGPU/extract-vector-elt-i64.ll | 43 + test/CodeGen/AMDGPU/fadd64.ll | 50 +- test/CodeGen/AMDGPU/fceil64.ll | 12 +- test/CodeGen/AMDGPU/fcmp.ll | 2 +- test/CodeGen/AMDGPU/flat-address-space.ll | 77 +- test/CodeGen/AMDGPU/flat-scratch-reg.ll | 36 + test/CodeGen/AMDGPU/fma-combine.ll | 200 + test/CodeGen/AMDGPU/fmax_legacy.ll | 40 + test/CodeGen/AMDGPU/fmin_legacy.ll | 63 + test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll | 102 + test/CodeGen/AMDGPU/fneg-fabs.ll | 27 +- test/CodeGen/AMDGPU/ftrunc.f64.ll | 12 +- test/CodeGen/AMDGPU/gep-address-space.ll | 34 +- test/CodeGen/AMDGPU/global-constant.ll | 27 + test/CodeGen/AMDGPU/global-extload-i32.ll | 327 +- test/CodeGen/AMDGPU/global_atomics.ll | 20 +- test/CodeGen/AMDGPU/half.ll | 256 +- test/CodeGen/AMDGPU/hsa-globals.ll | 132 + test/CodeGen/AMDGPU/hsa-group-segment.ll | 14 + test/CodeGen/AMDGPU/hsa.ll | 36 +- test/CodeGen/AMDGPU/image-attributes.ll | 206 + test/CodeGen/AMDGPU/image-resource-id.ll | 409 + test/CodeGen/AMDGPU/imm.ll | 24 +- test/CodeGen/AMDGPU/indirect-addressing-si.ll | 67 +- test/CodeGen/AMDGPU/indirect-private-64.ll | 34 +- test/CodeGen/AMDGPU/inline-constraints.ll | 23 + test/CodeGen/AMDGPU/insert_vector_elt.ll | 103 +- test/CodeGen/AMDGPU/kernel-args.ll | 26 +- test/CodeGen/AMDGPU/large-alloca-compute.ll | 57 + 
test/CodeGen/AMDGPU/large-alloca-graphics.ll | 47 + test/CodeGen/AMDGPU/large-alloca.ll | 15 - test/CodeGen/AMDGPU/literals.ll | 8 +- test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll | 4 +- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll | 2 +- test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll | 28 - test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll | 6 +- test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll | 1 - test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll | 37 + test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll | 2 +- test/CodeGen/AMDGPU/llvm.SI.packf16.ll | 29 + test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll | 16 + .../AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll | 14 + .../AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll | 16 + test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll | 16 + test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll | 30 + test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll | 24 + test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll | 29 + .../CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll | 29 + test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll | 27 + test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll | 27 + test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll | 2 +- test/CodeGen/AMDGPU/llvm.dbg.value.ll | 12 +- test/CodeGen/AMDGPU/llvm.memcpy.ll | 66 +- test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll | 184 + test/CodeGen/AMDGPU/llvm.round.f64.ll | 5 +- test/CodeGen/AMDGPU/load.ll | 34 +- test/CodeGen/AMDGPU/local-memory-two-objects.ll | 4 +- test/CodeGen/AMDGPU/local-memory.ll | 4 +- test/CodeGen/AMDGPU/max.ll | 116 +- test/CodeGen/AMDGPU/merge-stores.ll | 196 +- test/CodeGen/AMDGPU/min.ll | 171 +- .../AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll | 36 + test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 52 + test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll | 18 + test/CodeGen/AMDGPU/no-shrink-extloads.ll | 12 + test/CodeGen/AMDGPU/opencl-image-metadata.ll | 24 + test/CodeGen/AMDGPU/operand-folding.ll | 2 +- test/CodeGen/AMDGPU/or.ll | 2 +- .../partially-dead-super-register-immediate.ll | 28 + test/CodeGen/AMDGPU/private-memory.ll | 12 + 
test/CodeGen/AMDGPU/register-count-comments.ll | 3 +- test/CodeGen/AMDGPU/reorder-stores.ll | 58 +- test/CodeGen/AMDGPU/s_movk_i32.ll | 18 +- test/CodeGen/AMDGPU/salu-to-valu.ll | 418 +- test/CodeGen/AMDGPU/sampler-resource-id.ll | 65 + .../AMDGPU/schedule-vs-if-nested-loop-failure.ll | 2 +- test/CodeGen/AMDGPU/scratch-buffer.ll | 2 +- test/CodeGen/AMDGPU/select64.ll | 8 +- test/CodeGen/AMDGPU/set-dx10.ll | 48 +- test/CodeGen/AMDGPU/setcc-opt.ll | 22 +- test/CodeGen/AMDGPU/sext-in-reg.ll | 54 +- test/CodeGen/AMDGPU/shl.ll | 15 +- test/CodeGen/AMDGPU/shl_add_constant.ll | 6 +- test/CodeGen/AMDGPU/shl_add_ptr.ll | 2 +- .../si-instr-info-correct-implicit-operands.ll | 16 + test/CodeGen/AMDGPU/si-literal-folding.ll | 17 + test/CodeGen/AMDGPU/si-sgpr-spill.ll | 10 + test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll | 7 +- test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 6 +- test/CodeGen/AMDGPU/sminmax.ll | 130 + test/CodeGen/AMDGPU/smrd.ll | 73 +- test/CodeGen/AMDGPU/split-scalar-i64-add.ll | 42 +- .../AMDGPU/split-vector-memoperand-offsets.ll | 104 + test/CodeGen/AMDGPU/sra.ll | 8 +- test/CodeGen/AMDGPU/srl.ll | 13 +- test/CodeGen/AMDGPU/store-barrier.ll | 4 +- test/CodeGen/AMDGPU/store.ll | 25 +- test/CodeGen/AMDGPU/store_typed.ll | 24 + test/CodeGen/AMDGPU/sub.ll | 14 +- test/CodeGen/AMDGPU/trunc.ll | 8 +- test/CodeGen/AMDGPU/udivrem.ll | 130 +- test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 6 +- test/CodeGen/AMDGPU/unsupported-cc.ll | 32 +- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll | 167 + test/CodeGen/AMDGPU/valu-i1.ll | 16 +- .../vgpr-spill-emergency-stack-slot-compute.ll | 585 + .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 494 + test/CodeGen/AMDGPU/vop-shrink.ll | 4 +- test/CodeGen/AMDGPU/wait.ll | 61 +- test/CodeGen/AMDGPU/work-item-intrinsics.ll | 263 +- test/CodeGen/AMDGPU/xor.ll | 2 +- test/CodeGen/AMDGPU/zero_extend.ll | 3 +- test/CodeGen/ARM/2007-03-13-InstrSched.ll | 2 +- test/CodeGen/ARM/2009-10-16-Scope.ll | 6 +- 
test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll | 6 +- test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll | 24 +- test/CodeGen/ARM/2010-05-21-BuildVector.ll | 4 +- test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll | 4 +- test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll | 2 +- .../ARM/2010-06-25-Thumb2ITInvalidIterator.ll | 12 +- .../ARM/2010-06-29-PartialRedefFastAlloc.ll | 4 +- test/CodeGen/ARM/2010-08-04-StackVariable.ll | 24 +- test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll | 42 +- test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll | 2 +- test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll | 42 +- test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll | 4 +- test/CodeGen/ARM/2011-10-26-memset-inline.ll | 2 +- .../CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll | 10 +- test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll | 4 +- test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll | 14 +- test/CodeGen/ARM/2012-11-14-subs_carry.ll | 10 +- test/CodeGen/ARM/2013-10-11-select-stalls.ll | 13 +- .../ARM/2014-01-09-pseudo_expand_implicit_reg.ll | 4 +- test/CodeGen/ARM/MachO-subtypes.ll | 68 + test/CodeGen/ARM/Windows/division.ll | 38 + .../Windows/integer-floating-point-conversion.ll | 74 - test/CodeGen/ARM/Windows/libcalls.ll | 75 + test/CodeGen/ARM/Windows/no-eabi.ll | 10 + test/CodeGen/ARM/Windows/no-frame-register.ll | 22 + test/CodeGen/ARM/Windows/overflow.ll | 77 + test/CodeGen/ARM/adv-copy-opt.ll | 14 +- test/CodeGen/ARM/aliases.ll | 30 +- test/CodeGen/ARM/align-sp-adjustment.ll | 47 + test/CodeGen/ARM/apcs-vfp.ll | 153 + test/CodeGen/ARM/arm-eabi.ll | 63 + test/CodeGen/ARM/arm-interleaved-accesses.ll | 190 +- test/CodeGen/ARM/arm-shrink-wrapping-linux.ll | 142 + test/CodeGen/ARM/arm-shrink-wrapping.ll | 683 + test/CodeGen/ARM/atomic-64bit.ll | 6 + test/CodeGen/ARM/atomic-cmp.ll | 4 +- test/CodeGen/ARM/atomic-cmpxchg.ll | 98 +- test/CodeGen/ARM/atomic-op.ll | 46 +- test/CodeGen/ARM/atomic-ops-v8.ll | 38 +- test/CodeGen/ARM/avoid-cpsr-rmw.ll | 16 +- test/CodeGen/ARM/bfi.ll | 95 + 
.../ARM/build-attributes-optimization-minsize.ll | 18 + .../ARM/build-attributes-optimization-mixed.ll | 23 + .../ARM/build-attributes-optimization-optnone.ll | 18 + .../ARM/build-attributes-optimization-optsize.ll | 18 + test/CodeGen/ARM/build-attributes-optimization.ll | 23 + test/CodeGen/ARM/build-attributes.ll | 142 +- test/CodeGen/ARM/call-tc.ll | 8 +- test/CodeGen/ARM/cfi-alignment.ll | 48 + test/CodeGen/ARM/cmpxchg-idioms.ll | 6 +- test/CodeGen/ARM/cmpxchg-weak.ll | 56 +- test/CodeGen/ARM/coalesce-dbgvalue.ll | 10 +- test/CodeGen/ARM/coalesce-subregs.ll | 38 +- test/CodeGen/ARM/combine-vmovdrr.ll | 72 + test/CodeGen/ARM/constants.ll | 6 +- test/CodeGen/ARM/dagcombine-concatvector.ll | 4 +- test/CodeGen/ARM/debug-frame-vararg.ll | 14 +- test/CodeGen/ARM/debug-frame.ll | 28 +- test/CodeGen/ARM/debug-info-arg.ll | 20 +- test/CodeGen/ARM/debug-info-blocks.ll | 40 +- test/CodeGen/ARM/debug-info-branch-folding.ll | 32 +- test/CodeGen/ARM/debug-info-d16-reg.ll | 38 +- test/CodeGen/ARM/debug-info-no-frame.ll | 8 +- test/CodeGen/ARM/debug-info-qreg.ll | 28 +- test/CodeGen/ARM/debug-info-s16-reg.ll | 38 +- test/CodeGen/ARM/debug-info-sreg2.ll | 10 +- test/CodeGen/ARM/debug-segmented-stacks.ll | 14 +- test/CodeGen/ARM/debugtrap.ll | 17 + test/CodeGen/ARM/div.ll | 71 +- test/CodeGen/ARM/divmod-eabi.ll | 4 +- test/CodeGen/ARM/eh-resume-darwin.ll | 8 +- test/CodeGen/ARM/emutls.ll | 258 + test/CodeGen/ARM/emutls1.ll | 31 + test/CodeGen/ARM/emutls_generic.ll | 61 + test/CodeGen/ARM/fast-isel-align.ll | 24 +- test/CodeGen/ARM/fast-isel-ext.ll | 35 - test/CodeGen/ARM/fast-isel-mvn.ll | 10 +- test/CodeGen/ARM/fast-isel-pic.ll | 23 +- test/CodeGen/ARM/fold-stack-adjust.ll | 18 +- test/CodeGen/ARM/fp16-args.ll | 40 + test/CodeGen/ARM/fp16-promote.ll | 471 +- test/CodeGen/ARM/fp16.ll | 62 +- test/CodeGen/ARM/fparith.ll | 4 +- test/CodeGen/ARM/gep-optimization.ll | 77 + test/CodeGen/ARM/global-merge-1.ll | 6 +- test/CodeGen/ARM/global-merge-external.ll | 46 + 
test/CodeGen/ARM/globals.ll | 9 +- test/CodeGen/ARM/ifcvt-branch-weight-bug.ll | 14 +- test/CodeGen/ARM/ifcvt-branch-weight.ll | 2 +- test/CodeGen/ARM/ifcvt-iter-indbr.ll | 6 + test/CodeGen/ARM/ifcvt4.ll | 6 +- test/CodeGen/ARM/ifcvt5.ll | 4 +- test/CodeGen/ARM/ifcvt6.ll | 2 +- test/CodeGen/ARM/ifcvt8.ll | 4 +- test/CodeGen/ARM/inlineasm-switch-mode.ll | 4 +- test/CodeGen/ARM/ldm-stm-base-materialization.ll | 93 + test/CodeGen/ARM/ldrd.ll | 58 +- test/CodeGen/ARM/legalize-unaligned-load.ll | 35 + test/CodeGen/ARM/load-global.ll | 12 +- test/CodeGen/ARM/load-store-flags.ll | 4 +- test/CodeGen/ARM/load.ll | 571 +- test/CodeGen/ARM/machine-cse-cmp.ll | 2 +- test/CodeGen/ARM/memcpy-inline.ll | 2 +- test/CodeGen/ARM/memcpy-ldm-stm.ll | 94 + test/CodeGen/ARM/memfunc.ll | 255 +- test/CodeGen/ARM/minmax.ll | 193 + test/CodeGen/ARM/neon_minmax.ll | 1 + test/CodeGen/ARM/neon_spill.ll | 6 +- test/CodeGen/ARM/neon_vabs.ll | 38 + test/CodeGen/ARM/neon_vshl_minint.ll | 13 + test/CodeGen/ARM/out-of-registers.ll | 8 +- test/CodeGen/ARM/pr25317.ll | 11 + test/CodeGen/ARM/pr25838.ll | 34 + test/CodeGen/ARM/rbit.ll | 11 + test/CodeGen/ARM/reg_sequence.ll | 64 +- test/CodeGen/ARM/rotate.ll | 14 + test/CodeGen/ARM/sat-arith.ll | 63 + test/CodeGen/ARM/sched-it-debug-nodes.ll | 88 - test/CodeGen/ARM/setjmp_longjmp.ll | 113 + test/CodeGen/ARM/shifter_operand.ll | 228 +- test/CodeGen/ARM/sjlj-prepare-critical-edge.ll | 2 +- .../ARM/sjljehprepare-lower-empty-struct.ll | 1 + test/CodeGen/ARM/softfp-fabs-fneg.ll | 41 + test/CodeGen/ARM/special-reg-mcore.ll | 2 +- test/CodeGen/ARM/spill-q.ll | 28 +- test/CodeGen/ARM/ssat-lower.ll | 11 + test/CodeGen/ARM/ssat-upper.ll | 11 + test/CodeGen/ARM/subtarget-no-movt.ll | 45 + test/CodeGen/ARM/tail-merge-branch-weight.ll | 2 +- test/CodeGen/ARM/taildup-branch-weight.ll | 4 +- test/CodeGen/ARM/test-sharedidx.ll | 15 +- test/CodeGen/ARM/thumb-alignment.ll | 2 +- test/CodeGen/ARM/thumb1-ldst-opt.ll | 27 + test/CodeGen/ARM/thumb1_return_sequence.ll | 70 +- 
test/CodeGen/ARM/thumb2-it-block.ll | 24 +- test/CodeGen/ARM/thumb_indirect_calls.ll | 5 +- test/CodeGen/ARM/tls-models.ll | 74 +- test/CodeGen/ARM/tls3.ll | 29 +- test/CodeGen/ARM/unaligned_load_store.ll | 4 +- test/CodeGen/ARM/unaligned_load_store_vfp.ll | 98 + test/CodeGen/ARM/usat-lower.ll | 11 + test/CodeGen/ARM/usat-upper.ll | 11 + test/CodeGen/ARM/v7k-abi-align.ll | 152 + test/CodeGen/ARM/v7k-libcalls.ll | 154 + test/CodeGen/ARM/v7k-sincos.ll | 16 + test/CodeGen/ARM/vcge.ll | 4 +- test/CodeGen/ARM/vcombine.ll | 64 +- test/CodeGen/ARM/vcvt_combine.ll | 103 +- test/CodeGen/ARM/vdiv_combine.ll | 17 + test/CodeGen/ARM/vdup.ll | 16 + test/CodeGen/ARM/vector-DAGCombine.ll | 4 +- test/CodeGen/ARM/vector-load.ll | 4 +- test/CodeGen/ARM/vector-store.ll | 6 +- test/CodeGen/ARM/vext.ll | 34 +- test/CodeGen/ARM/vfp-reg-stride.ll | 42 + test/CodeGen/ARM/vfp-regs-dwarf.ll | 6 +- test/CodeGen/ARM/vld-vst-upgrade.ll | 139 + test/CodeGen/ARM/vld1.ll | 52 +- test/CodeGen/ARM/vld2.ll | 40 +- test/CodeGen/ARM/vld3.ll | 42 +- test/CodeGen/ARM/vld4.ll | 42 +- test/CodeGen/ARM/vlddup.ll | 30 +- test/CodeGen/ARM/vldlane.ll | 92 +- test/CodeGen/ARM/vminmaxnm-safe.ll | 396 + test/CodeGen/ARM/vminmaxnm.ll | 358 +- test/CodeGen/ARM/vmov.ll | 4 +- test/CodeGen/ARM/vmul.ll | 14 +- test/CodeGen/ARM/vpadd.ll | 2 +- test/CodeGen/ARM/vselect_imax.ll | 26 +- test/CodeGen/ARM/vst1.ll | 48 +- test/CodeGen/ARM/vst2.ll | 44 +- test/CodeGen/ARM/vst3.ll | 42 +- test/CodeGen/ARM/vst4.ll | 42 +- test/CodeGen/ARM/vstlane.ll | 90 +- test/CodeGen/ARM/vtrn.ll | 124 +- test/CodeGen/ARM/vuzp.ll | 136 +- test/CodeGen/ARM/vzip.ll | 82 +- test/CodeGen/BPF/sockex2.ll | 2 +- test/CodeGen/CPP/gep.ll | 10 + test/CodeGen/Generic/2009-03-17-LSR-APInt.ll | 28 +- test/CodeGen/Generic/ForceStackAlign.ll | 27 + test/CodeGen/Generic/MachineBranchProb.ll | 8 +- test/CodeGen/Generic/dbg_value.ll | 5 +- test/CodeGen/Generic/lit.local.cfg | 3 + test/CodeGen/Generic/overloaded-intrinsic-name.ll | 32 +- 
test/CodeGen/Generic/vector.ll | 6 + test/CodeGen/Hexagon/NVJumpCmp.ll | 89 + test/CodeGen/Hexagon/absaddr-store.ll | 1 + test/CodeGen/Hexagon/adde.ll | 6 +- test/CodeGen/Hexagon/alu64.ll | 134 +- test/CodeGen/Hexagon/bit-eval.ll | 53 + test/CodeGen/Hexagon/bit-loop.ll | 80 + test/CodeGen/Hexagon/cfi-late.ll | 65 + test/CodeGen/Hexagon/clr_set_toggle.ll | 2 +- test/CodeGen/Hexagon/combine.ll | 2 +- test/CodeGen/Hexagon/combine_ir.ll | 16 +- test/CodeGen/Hexagon/early-if-conversion-bug1.ll | 412 + test/CodeGen/Hexagon/early-if-phi-i1.ll | 17 + test/CodeGen/Hexagon/early-if-spare.ll | 57 + test/CodeGen/Hexagon/early-if.ll | 75 + test/CodeGen/Hexagon/extload-combine.ll | 2 +- test/CodeGen/Hexagon/hwloop-dbg.ll | 12 +- test/CodeGen/Hexagon/i16_VarArg.ll | 2 +- test/CodeGen/Hexagon/i1_VarArg.ll | 2 +- test/CodeGen/Hexagon/i8_VarArg.ll | 2 +- test/CodeGen/Hexagon/ifcvt-edge-weight.ll | 64 + test/CodeGen/Hexagon/memcpy-likely-aligned.ll | 32 + test/CodeGen/Hexagon/mux-basic.ll | 28 + test/CodeGen/Hexagon/opt-fabs.ll | 2 +- test/CodeGen/Hexagon/pic-jumptables.ll | 48 + test/CodeGen/Hexagon/pic-simple.ll | 22 + test/CodeGen/Hexagon/pic-static.ll | 21 + test/CodeGen/Hexagon/relax.ll | 9 +- test/CodeGen/Hexagon/sdr-basic.ll | 15 + test/CodeGen/Hexagon/sdr-shr32.ll | 22 + test/CodeGen/Hexagon/simple_addend.ll | 2 +- test/CodeGen/Hexagon/store-widen-aliased-load.ll | 21 + test/CodeGen/Hexagon/store-widen-negv.ll | 11 + test/CodeGen/Hexagon/store-widen-negv2.ll | 19 + test/CodeGen/Hexagon/store-widen.ll | 18 + test/CodeGen/Hexagon/struct_args.ll | 2 +- test/CodeGen/Hexagon/sube.ll | 8 +- test/CodeGen/Hexagon/tail-dup-subreg-abort.ll | 28 + test/CodeGen/Hexagon/tfr-to-combine.ll | 2 +- test/CodeGen/Hexagon/union-1.ll | 2 - test/CodeGen/Hexagon/v60Intrins.ll | 2559 +++ test/CodeGen/Hexagon/v60Vasr.ll | 247 + test/CodeGen/Hexagon/v60small.ll | 51 + test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll | 2 +- test/CodeGen/Hexagon/vect/vect-loadv4i16.ll | 2 +- 
test/CodeGen/Hexagon/vect/vect-shuffle.ll | 2 +- test/CodeGen/Hexagon/vect/vect-splat.ll | 2 +- test/CodeGen/Hexagon/vect/vect-xor.ll | 2 +- test/CodeGen/Inputs/DbgValueOtherTargets.ll | 8 +- test/CodeGen/MIR/AArch64/cfi-def-cfa.mir | 31 + .../MIR/AArch64/expected-target-flag-name.mir | 23 + .../MIR/AArch64/invalid-target-flag-name.mir | 23 + test/CodeGen/MIR/AArch64/lit.local.cfg | 8 + test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir | 28 + .../MIR/AArch64/stack-object-local-offset.mir | 41 + test/CodeGen/MIR/AArch64/target-flags.mir | 39 + .../MIR/AMDGPU/expected-target-index-name.mir | 64 + .../MIR/AMDGPU/invalid-target-index-operand.mir | 64 + test/CodeGen/MIR/AMDGPU/lit.local.cfg | 2 + test/CodeGen/MIR/AMDGPU/target-index-operands.mir | 104 + test/CodeGen/MIR/ARM/ARMLoadStoreDBG.mir | 165 + test/CodeGen/MIR/ARM/bundled-instructions.mir | 75 + test/CodeGen/MIR/ARM/cfi-same-value.mir | 80 + test/CodeGen/MIR/ARM/expected-closing-brace.mir | 50 + .../MIR/ARM/extraneous-closing-brace-error.mir | 20 + test/CodeGen/MIR/ARM/lit.local.cfg | 2 + .../MIR/ARM/nested-instruction-bundle-error.mir | 30 + test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir | 160 + test/CodeGen/MIR/Generic/basic-blocks.mir | 49 + .../Generic/expected-colon-after-basic-block.mir | 16 + .../expected-mbb-reference-for-successor-mbb.mir | 28 + test/CodeGen/MIR/Generic/frame-info.mir | 89 + .../Generic/function-missing-machine-function.mir | 13 + .../MIR/Generic/invalid-jump-table-kind.mir | 53 + test/CodeGen/MIR/Generic/lit.local.cfg | 3 + .../CodeGen/MIR/Generic/llvm-ir-error-reported.mir | 22 + test/CodeGen/MIR/Generic/llvmIR.mir | 37 + test/CodeGen/MIR/Generic/llvmIRMissing.mir | 9 + .../machine-basic-block-ir-block-reference.mir | 17 + .../machine-basic-block-redefinition-error.mir | 18 + .../machine-basic-block-undefined-ir-block.mir | 15 + .../Generic/machine-basic-block-unknown-name.mir | 18 + .../machine-function-missing-body-error.mir | 15 + .../Generic/machine-function-missing-function.mir | 
23 + .../MIR/Generic/machine-function-missing-name.mir | 26 + .../machine-function-redefinition-error.mir | 10 + test/CodeGen/MIR/Generic/machine-function.mir | 66 + test/CodeGen/MIR/Generic/register-info.mir | 40 + ...ted-global-value-or-symbol-after-call-entry.mir | 41 + test/CodeGen/MIR/Mips/lit.local.cfg | 2 + test/CodeGen/MIR/Mips/memory-operands.mir | 102 + .../MIR/NVPTX/expected-floating-point-literal.mir | 24 + .../NVPTX/floating-point-immediate-operands.mir | 81 + .../NVPTX/floating-point-invalid-type-error.mir | 24 + test/CodeGen/MIR/NVPTX/lit.local.cfg | 2 + test/CodeGen/MIR/PowerPC/lit.local.cfg | 2 + .../MIR/PowerPC/unordered-implicit-registers.mir | 45 + test/CodeGen/MIR/X86/basic-block-liveins.mir | 57 +- .../X86/basic-block-not-at-start-of-line-error.mir | 41 + test/CodeGen/MIR/X86/block-address-operands.mir | 121 + test/CodeGen/MIR/X86/callee-saved-info.mir | 95 + test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir | 29 + test/CodeGen/MIR/X86/cfi-def-cfa-register.mir | 32 + test/CodeGen/MIR/X86/cfi-offset.mir | 47 + .../X86/constant-pool-item-redefinition-error.mir | 25 + test/CodeGen/MIR/X86/constant-pool.mir | 139 + test/CodeGen/MIR/X86/constant-value-error.mir | 25 + test/CodeGen/MIR/X86/dead-register-flag.mir | 14 +- .../MIR/X86/def-register-already-tied-error.mir | 25 + .../MIR/X86/duplicate-memory-operand-flag.mir | 27 + .../MIR/X86/duplicate-register-flag-error.mir | 35 + .../MIR/X86/early-clobber-register-flag.mir | 45 + .../MIR/X86/expected-align-in-memory-operand.mir | 30 + ...ted-alignment-after-align-in-memory-operand.mir | 30 + .../X86/expected-basic-block-at-start-of-body.mir | 40 + .../expected-block-reference-in-blockaddress.mir | 30 + .../MIR/X86/expected-comma-after-cfi-register.mir | 42 + .../X86/expected-comma-after-memory-operand.mir | 25 + .../X86/expected-different-implicit-operand.mir | 28 +- .../expected-different-implicit-register-flag.mir | 28 +- .../MIR/X86/expected-from-in-memory-operand.mir | 24 + 
...ected-function-reference-after-blockaddress.mir | 30 + .../expected-global-value-after-blockaddress.mir | 30 + .../MIR/X86/expected-integer-after-offset-sign.mir | 24 + .../MIR/X86/expected-integer-after-tied-def.mir | 25 + .../X86/expected-integer-in-successor-weight.mir | 38 + .../expected-load-or-store-in-memory-operand.mir | 23 + test/CodeGen/MIR/X86/expected-machine-operand.mir | 12 +- ...expected-metadata-node-after-debug-location.mir | 59 + .../X86/expected-metadata-node-after-exclaim.mir | 59 + .../X86/expected-metadata-node-in-stack-object.mir | 25 + .../expected-named-register-in-allocation-hint.mir | 29 + ...ted-named-register-in-callee-saved-register.mir | 88 + ...expected-named-register-in-functions-livein.mir | 27 + .../MIR/X86/expected-named-register-livein.mir | 15 +- .../MIR/X86/expected-newline-at-end-of-list.mir | 41 + test/CodeGen/MIR/X86/expected-number-after-bb.mir | 28 +- .../MIR/X86/expected-offset-after-cfi-operand.mir | 27 + .../expected-pointer-value-in-memory-operand.mir | 24 + .../expected-positive-alignment-after-align.mir | 30 + .../X86/expected-register-after-cfi-operand.mir | 42 + .../MIR/X86/expected-register-after-flags.mir | 12 +- ...xpected-size-integer-after-memory-operation.mir | 24 + test/CodeGen/MIR/X86/expected-stack-object.mir | 67 + .../MIR/X86/expected-subregister-after-colon.mir | 18 +- test/CodeGen/MIR/X86/expected-target-flag-name.mir | 24 + .../MIR/X86/expected-tied-def-after-lparen.mir | 25 + .../MIR/X86/expected-value-in-memory-operand.mir | 24 + ...pected-virtual-register-in-functions-livein.mir | 27 + test/CodeGen/MIR/X86/external-symbol-operands.mir | 64 + .../MIR/X86/fixed-stack-memory-operands.mir | 39 + .../X86/fixed-stack-object-redefinition-error.mir | 28 + test/CodeGen/MIR/X86/fixed-stack-objects.mir | 12 +- .../MIR/X86/frame-info-save-restore-points.mir | 73 + .../MIR/X86/frame-info-stack-references.mir | 79 + .../MIR/X86/frame-setup-instruction-flag.mir | 35 + test/CodeGen/MIR/X86/function-liveins.mir 
| 37 + test/CodeGen/MIR/X86/global-value-operands.mir | 127 +- test/CodeGen/MIR/X86/immediate-operands.mir | 28 +- test/CodeGen/MIR/X86/implicit-register-flag.mir | 65 +- test/CodeGen/MIR/X86/inline-asm-registers.mir | 54 + .../MIR/X86/instructions-debug-location.mir | 98 + .../CodeGen/MIR/X86/invalid-constant-pool-item.mir | 25 + .../CodeGen/MIR/X86/invalid-metadata-node-type.mir | 53 + test/CodeGen/MIR/X86/invalid-target-flag-name.mir | 24 + .../MIR/X86/invalid-tied-def-index-error.mir | 25 + test/CodeGen/MIR/X86/jump-table-info.mir | 150 + .../MIR/X86/jump-table-redefinition-error.mir | 76 + test/CodeGen/MIR/X86/killed-register-flag.mir | 38 +- .../MIR/X86/large-cfi-offset-number-error.mir | 27 + .../MIR/X86/large-immediate-operand-error.mir | 18 + test/CodeGen/MIR/X86/large-index-number-error.mir | 26 +- test/CodeGen/MIR/X86/large-offset-number-error.mir | 24 + .../MIR/X86/large-size-in-memory-operand-error.mir | 24 + test/CodeGen/MIR/X86/liveout-register-mask.mir | 42 + .../MIR/X86/machine-basic-block-operands.mir | 68 +- test/CodeGen/MIR/X86/machine-instructions.mir | 14 +- test/CodeGen/MIR/X86/machine-verifier.mir | 22 + test/CodeGen/MIR/X86/memory-operands.mir | 508 + test/CodeGen/MIR/X86/metadata-operands.mir | 63 + test/CodeGen/MIR/X86/missing-closing-quote.mir | 22 + test/CodeGen/MIR/X86/missing-comma.mir | 12 +- test/CodeGen/MIR/X86/missing-implicit-operand.mir | 30 +- test/CodeGen/MIR/X86/missing-instruction.mir | 19 - test/CodeGen/MIR/X86/named-registers.mir | 14 +- test/CodeGen/MIR/X86/newline-handling.mir | 109 + test/CodeGen/MIR/X86/null-register-operands.mir | 14 +- test/CodeGen/MIR/X86/register-mask-operands.mir | 28 +- .../X86/register-operands-target-flag-error.mir | 24 + .../MIR/X86/simple-register-allocation-hints.mir | 34 + .../X86/spill-slot-fixed-stack-object-aliased.mir | 12 +- .../spill-slot-fixed-stack-object-immutable.mir | 12 +- .../MIR/X86/spill-slot-fixed-stack-objects.mir | 12 +- test/CodeGen/MIR/X86/stack-object-debug-info.mir | 
65 + test/CodeGen/MIR/X86/stack-object-invalid-name.mir | 28 + .../stack-object-operand-name-mismatch-error.mir | 33 + test/CodeGen/MIR/X86/stack-object-operands.mir | 45 + .../MIR/X86/stack-object-redefinition-error.mir | 37 + test/CodeGen/MIR/X86/stack-objects.mir | 22 +- test/CodeGen/MIR/X86/standalone-register-error.mir | 24 + test/CodeGen/MIR/X86/subregister-operands.mir | 21 +- .../MIR/X86/successor-basic-blocks-weights.mir | 42 + test/CodeGen/MIR/X86/successor-basic-blocks.mir | 83 + test/CodeGen/MIR/X86/tied-def-operand-invalid.mir | 25 + test/CodeGen/MIR/X86/undef-register-flag.mir | 26 +- .../MIR/X86/undefined-fixed-stack-object.mir | 38 + test/CodeGen/MIR/X86/undefined-global-value.mir | 16 +- .../MIR/X86/undefined-ir-block-in-blockaddress.mir | 30 + .../undefined-ir-block-slot-in-blockaddress.mir | 29 + test/CodeGen/MIR/X86/undefined-jump-table-id.mir | 73 + .../MIR/X86/undefined-named-global-value.mir | 16 +- test/CodeGen/MIR/X86/undefined-register-class.mir | 8 +- test/CodeGen/MIR/X86/undefined-stack-object.mir | 30 + .../MIR/X86/undefined-value-in-memory-operand.mir | 24 + .../CodeGen/MIR/X86/undefined-virtual-register.mir | 14 +- test/CodeGen/MIR/X86/unknown-instruction.mir | 10 +- .../MIR/X86/unknown-machine-basic-block.mir | 26 +- test/CodeGen/MIR/X86/unknown-metadata-keyword.mir | 25 + test/CodeGen/MIR/X86/unknown-metadata-node.mir | 59 + .../MIR/X86/unknown-named-machine-basic-block.mir | 28 +- test/CodeGen/MIR/X86/unknown-register.mir | 12 +- test/CodeGen/MIR/X86/unknown-subregister-index.mir | 18 +- test/CodeGen/MIR/X86/unrecognized-character.mir | 10 +- .../MIR/X86/used-physical-register-info.mir | 109 + .../X86/variable-sized-stack-object-size-error.mir | 14 +- .../MIR/X86/variable-sized-stack-objects.mir | 18 +- .../X86/virtual-register-redefinition-error.mir | 27 + test/CodeGen/MIR/X86/virtual-registers.mir | 90 +- test/CodeGen/MIR/basic-blocks.mir | 49 - .../MIR/expected-eof-after-successor-mbb.mir | 29 - 
.../expected-mbb-reference-for-successor-mbb.mir | 29 - test/CodeGen/MIR/frame-info.mir | 91 - .../MIR/function-missing-machine-function.mir | 13 - test/CodeGen/MIR/llvm-ir-error-reported.mir | 22 - test/CodeGen/MIR/llvmIR.mir | 37 - test/CodeGen/MIR/llvmIRMissing.mir | 9 - .../MIR/machine-basic-block-redefinition-error.mir | 17 - .../MIR/machine-basic-block-unknown-name.mir | 19 - .../MIR/machine-function-missing-body-error.mir | 15 - .../MIR/machine-function-missing-function.mir | 23 - test/CodeGen/MIR/machine-function-missing-name.mir | 26 - .../MIR/machine-function-redefinition-error.mir | 10 - test/CodeGen/MIR/machine-function.mir | 66 - test/CodeGen/MIR/register-info.mir | 40 - test/CodeGen/MIR/successor-basic-blocks.mir | 58 - .../CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll | 27 + test/CodeGen/Mips/addi.ll | 2 +- test/CodeGen/Mips/adjust-callstack-sp.ll | 2 +- test/CodeGen/Mips/align16.ll | 2 +- test/CodeGen/Mips/alloca16.ll | 2 +- test/CodeGen/Mips/and1.ll | 2 +- test/CodeGen/Mips/asm-large-immediate.ll | 3 +- test/CodeGen/Mips/atomicops.ll | 2 +- test/CodeGen/Mips/beqzc.ll | 2 +- test/CodeGen/Mips/beqzc1.ll | 2 +- test/CodeGen/Mips/br-jmp.ll | 4 +- test/CodeGen/Mips/brconeq.ll | 2 +- test/CodeGen/Mips/brconeqk.ll | 2 +- test/CodeGen/Mips/brconeqz.ll | 2 +- test/CodeGen/Mips/brconge.ll | 2 +- test/CodeGen/Mips/brcongt.ll | 2 +- test/CodeGen/Mips/brconle.ll | 2 +- test/CodeGen/Mips/brconlt.ll | 2 +- test/CodeGen/Mips/brconne.ll | 2 +- test/CodeGen/Mips/brconnek.ll | 2 +- test/CodeGen/Mips/brconnez.ll | 2 +- test/CodeGen/Mips/brind.ll | 2 +- test/CodeGen/Mips/brsize3.ll | 4 +- test/CodeGen/Mips/brsize3a.ll | 2 +- test/CodeGen/Mips/cconv/arguments-varargs.ll | 72 +- test/CodeGen/Mips/ci2.ll | 2 +- test/CodeGen/Mips/cmplarge.ll | 2 +- test/CodeGen/Mips/const1.ll | 2 +- test/CodeGen/Mips/const4a.ll | 2 +- test/CodeGen/Mips/const6.ll | 4 +- test/CodeGen/Mips/const6a.ll | 4 +- test/CodeGen/Mips/div.ll | 2 +- test/CodeGen/Mips/div_rem.ll | 2 +- 
test/CodeGen/Mips/divu.ll | 2 +- test/CodeGen/Mips/divu_remu.ll | 2 +- test/CodeGen/Mips/eh.ll | 2 +- test/CodeGen/Mips/emergency-spill-slot-near-fp.ll | 4 +- test/CodeGen/Mips/emutls_generic.ll | 70 + test/CodeGen/Mips/ex2.ll | 2 +- test/CodeGen/Mips/extins.ll | 2 +- test/CodeGen/Mips/f16abs.ll | 2 +- test/CodeGen/Mips/fixdfsf.ll | 4 +- test/CodeGen/Mips/fp16instrinsmc.ll | 4 +- test/CodeGen/Mips/fp16mix.ll | 6 +- test/CodeGen/Mips/fp16static.ll | 2 +- test/CodeGen/Mips/helloworld.ll | 12 +- test/CodeGen/Mips/hf16_1.ll | 4 +- test/CodeGen/Mips/hf16call32.ll | 408 +- test/CodeGen/Mips/hf16call32_body.ll | 206 +- test/CodeGen/Mips/hf1_body.ll | 18 +- test/CodeGen/Mips/hfptrcall.ll | 2 +- test/CodeGen/Mips/i32k.ll | 2 +- .../CodeGen/Mips/inlineasm-assembler-directives.ll | 4 +- test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll | 36 +- test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll | 4 +- test/CodeGen/Mips/inlineasm-operand-code.ll | 185 +- test/CodeGen/Mips/inlineasm_constraint.ll | 94 +- test/CodeGen/Mips/inlineasmmemop.ll | 8 +- test/CodeGen/Mips/insn-zero-size-bb.ll | 4 +- test/CodeGen/Mips/interrupt-attr-64-error.ll | 9 + test/CodeGen/Mips/interrupt-attr-args-error.ll | 9 + test/CodeGen/Mips/interrupt-attr-error.ll | 9 + test/CodeGen/Mips/interrupt-attr.ll | 244 + test/CodeGen/Mips/jtstat.ll | 2 +- test/CodeGen/Mips/l3mc.ll | 20 +- test/CodeGen/Mips/lb1.ll | 2 +- test/CodeGen/Mips/lbu1.ll | 2 +- test/CodeGen/Mips/lcb2.ll | 4 +- test/CodeGen/Mips/lcb3c.ll | 2 +- test/CodeGen/Mips/lcb4a.ll | 2 +- test/CodeGen/Mips/lcb5.ll | 2 +- test/CodeGen/Mips/lh1.ll | 2 +- test/CodeGen/Mips/lhu1.ll | 2 +- test/CodeGen/Mips/llcarry.ll | 2 +- test/CodeGen/Mips/llvm-ir/atomicrmx.ll | 26 + test/CodeGen/Mips/llvm-ir/call.ll | 14 + test/CodeGen/Mips/llvm-ir/load-atomic.ll | 42 + test/CodeGen/Mips/llvm-ir/sqrt.ll | 13 + test/CodeGen/Mips/llvm-ir/store-atomic.ll | 42 + test/CodeGen/Mips/madd-msub.ll | 2 +- test/CodeGen/Mips/mbrsize4a.ll | 2 +- test/CodeGen/Mips/mips16-hf-attr-2.ll | 2 +- 
test/CodeGen/Mips/mips16-hf-attr.ll | 2 +- test/CodeGen/Mips/mips16_32_1.ll | 2 +- test/CodeGen/Mips/mips16_32_10.ll | 2 +- test/CodeGen/Mips/mips16_32_3.ll | 2 +- test/CodeGen/Mips/mips16_32_4.ll | 2 +- test/CodeGen/Mips/mips16_32_5.ll | 2 +- test/CodeGen/Mips/mips16_32_6.ll | 2 +- test/CodeGen/Mips/mips16_32_7.ll | 2 +- test/CodeGen/Mips/mips16_fpret.ll | 8 +- test/CodeGen/Mips/mips16ex.ll | 2 +- test/CodeGen/Mips/mips16fpe.ll | 6 +- test/CodeGen/Mips/misha.ll | 2 +- test/CodeGen/Mips/msa/elm_copy.ll | 5 +- test/CodeGen/Mips/mul.ll | 2 +- test/CodeGen/Mips/mulll.ll | 2 +- test/CodeGen/Mips/mulull.ll | 2 +- test/CodeGen/Mips/nacl-align.ll | 7 +- test/CodeGen/Mips/neg1.ll | 2 +- test/CodeGen/Mips/no-odd-spreg-msa.ll | 24 +- test/CodeGen/Mips/nomips16.ll | 2 +- test/CodeGen/Mips/not1.ll | 2 +- test/CodeGen/Mips/null.ll | 2 +- test/CodeGen/Mips/or1.ll | 2 +- test/CodeGen/Mips/powif64_16.ll | 2 +- test/CodeGen/Mips/rem.ll | 2 +- test/CodeGen/Mips/remu.ll | 2 +- test/CodeGen/Mips/s2rem.ll | 4 +- test/CodeGen/Mips/sb1.ll | 2 +- test/CodeGen/Mips/sel1c.ll | 2 +- test/CodeGen/Mips/sel2c.ll | 2 +- test/CodeGen/Mips/selTBteqzCmpi.ll | 2 +- test/CodeGen/Mips/selTBtnezCmpi.ll | 2 +- test/CodeGen/Mips/selTBtnezSlti.ll | 2 +- test/CodeGen/Mips/seleq.ll | 2 +- test/CodeGen/Mips/seleqk.ll | 2 +- test/CodeGen/Mips/selgek.ll | 2 +- test/CodeGen/Mips/selgt.ll | 2 +- test/CodeGen/Mips/selle.ll | 2 +- test/CodeGen/Mips/selltk.ll | 2 +- test/CodeGen/Mips/selne.ll | 2 +- test/CodeGen/Mips/selnek.ll | 2 +- test/CodeGen/Mips/selpat.ll | 2 +- test/CodeGen/Mips/seteq.ll | 2 +- test/CodeGen/Mips/seteqz.ll | 2 +- test/CodeGen/Mips/setge.ll | 2 +- test/CodeGen/Mips/setgek.ll | 2 +- test/CodeGen/Mips/setle.ll | 2 +- test/CodeGen/Mips/setlt.ll | 2 +- test/CodeGen/Mips/setltk.ll | 2 +- test/CodeGen/Mips/setne.ll | 2 +- test/CodeGen/Mips/setuge.ll | 2 +- test/CodeGen/Mips/setugt.ll | 2 +- test/CodeGen/Mips/setule.ll | 2 +- test/CodeGen/Mips/setult.ll | 2 +- test/CodeGen/Mips/setultk.ll | 2 +- 
test/CodeGen/Mips/sh1.ll | 2 +- test/CodeGen/Mips/simplebr.ll | 2 +- test/CodeGen/Mips/sitofp-selectcc-opt.ll | 3 +- test/CodeGen/Mips/sll1.ll | 2 +- test/CodeGen/Mips/sll2.ll | 2 +- test/CodeGen/Mips/sr1.ll | 4 +- test/CodeGen/Mips/sra1.ll | 2 +- test/CodeGen/Mips/sra2.ll | 2 +- test/CodeGen/Mips/srl1.ll | 2 +- test/CodeGen/Mips/srl2.ll | 2 +- test/CodeGen/Mips/stchar.ll | 4 +- test/CodeGen/Mips/stldst.ll | 2 +- test/CodeGen/Mips/sub1.ll | 2 +- test/CodeGen/Mips/sub2.ll | 2 +- test/CodeGen/Mips/tail16.ll | 2 +- test/CodeGen/Mips/tailcall.ll | 2 +- test/CodeGen/Mips/tls-alias.ll | 2 +- test/CodeGen/Mips/tls16.ll | 2 +- test/CodeGen/Mips/tls16_2.ll | 2 +- test/CodeGen/Mips/trap1.ll | 2 +- test/CodeGen/Mips/ul1.ll | 2 +- test/CodeGen/Mips/xor1.ll | 2 +- test/CodeGen/NVPTX/branch-fold.ll | 40 + test/CodeGen/NVPTX/bypass-div.ll | 80 + test/CodeGen/NVPTX/combine-min-max.ll | 307 + test/CodeGen/NVPTX/fma-assoc.ll | 13 + test/CodeGen/NVPTX/global-addrspace.ll | 12 + test/CodeGen/NVPTX/load-with-non-coherent-cache.ll | 264 + test/CodeGen/NVPTX/lower-aggr-copies.ll | 118 +- test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll | 20 +- test/CodeGen/NVPTX/reg-copy.ll | 224 + test/CodeGen/NVPTX/symbol-naming.ll | 4 +- test/CodeGen/NVPTX/vector-call.ll | 2 +- test/CodeGen/PowerPC/2006-01-20-ShiftPartsCrash.ll | 1 + test/CodeGen/PowerPC/2006-08-15-SelectionCrash.ll | 1 + test/CodeGen/PowerPC/2006-12-07-LargeAlloca.ll | 1 + test/CodeGen/PowerPC/2006-12-07-SelectCrash.ll | 1 + test/CodeGen/PowerPC/2007-11-19-VectorSplitting.ll | 1 + test/CodeGen/PowerPC/BoolRetToIntTest.ll | 203 + test/CodeGen/PowerPC/BreakableToken-reduced.ll | 335 + test/CodeGen/PowerPC/aantidep-def-ec.mir | 117 + test/CodeGen/PowerPC/aantidep-inline-asm-use.ll | 305 + test/CodeGen/PowerPC/addisdtprelha-nonr3.mir | 80 + test/CodeGen/PowerPC/alias.ll | 4 +- test/CodeGen/PowerPC/bitcasts-direct-move.ll | 83 + test/CodeGen/PowerPC/bitreverse.ll | 23 + test/CodeGen/PowerPC/branch-hint.ll | 135 + 
test/CodeGen/PowerPC/coal-sections.ll | 24 + test/CodeGen/PowerPC/crbit-asm-disabled.ll | 16 + test/CodeGen/PowerPC/crbit-asm.ll | 3 +- test/CodeGen/PowerPC/cttz.ll | 2 +- test/CodeGen/PowerPC/dbg.ll | 10 +- test/CodeGen/PowerPC/dyn-alloca-offset.ll | 21 + test/CodeGen/PowerPC/e500-1.ll | 30 + test/CodeGen/PowerPC/emutls_generic.ll | 41 + test/CodeGen/PowerPC/fast-isel-binary.ll | 26 +- test/CodeGen/PowerPC/fast-isel-br-const.ll | 2 +- test/CodeGen/PowerPC/fast-isel-call.ll | 14 +- test/CodeGen/PowerPC/fast-isel-cmp-imm.ll | 34 +- test/CodeGen/PowerPC/fast-isel-const.ll | 2 +- test/CodeGen/PowerPC/fast-isel-conversion-p5.ll | 20 +- test/CodeGen/PowerPC/fast-isel-conversion.ll | 48 +- test/CodeGen/PowerPC/fast-isel-crash.ll | 4 +- test/CodeGen/PowerPC/fast-isel-ext.ll | 20 +- test/CodeGen/PowerPC/fast-isel-fold.ll | 26 +- test/CodeGen/PowerPC/fast-isel-indirectbr.ll | 2 +- test/CodeGen/PowerPC/fast-isel-load-store.ll | 34 +- test/CodeGen/PowerPC/fast-isel-redefinition.ll | 2 +- test/CodeGen/PowerPC/fast-isel-ret.ll | 52 +- test/CodeGen/PowerPC/fast-isel-shifter.ll | 12 +- .../PowerPC/fastisel-gep-promote-before-add.ll | 2 +- .../PowerPC/fma-mutate-register-constraint.ll | 89 + .../PowerPC/fp-int-conversions-direct-moves.ll | 24 +- .../PowerPC/fp128-bitcast-after-operation.ll | 137 + test/CodeGen/PowerPC/load-shift-combine.ll | 1 + test/CodeGen/PowerPC/long-compare.ll | 2 +- test/CodeGen/PowerPC/machine-combiner.ll | 188 + test/CodeGen/PowerPC/mc-instrlat.ll | 25 + test/CodeGen/PowerPC/mcm-13.ll | 27 + test/CodeGen/PowerPC/memcpy-vec.ll | 7 +- test/CodeGen/PowerPC/merge-st-chain-op.ll | 41 + .../PowerPC/p8-scalar_vector_conversions.ll | 1476 ++ test/CodeGen/PowerPC/peephole-align.ll | 335 + test/CodeGen/PowerPC/ppc-shrink-wrapping.ll | 784 + test/CodeGen/PowerPC/ppc32-i1-vaarg.ll | 2 +- test/CodeGen/PowerPC/ppc64-icbt-pwr7.ll | 8 +- test/CodeGen/PowerPC/ppcsoftops.ll | 50 + test/CodeGen/PowerPC/pr17168.ll | 366 +- test/CodeGen/PowerPC/pr24546.ll | 22 +- 
test/CodeGen/PowerPC/pr24636.ll | 41 + test/CodeGen/PowerPC/pr25157-peephole.ll | 61 + test/CodeGen/PowerPC/preincprep-nontrans-crash.ll | 94 + test/CodeGen/PowerPC/qpx-unal-cons-lds.ll | 217 + test/CodeGen/PowerPC/retaddr2.ll | 6 +- test/CodeGen/PowerPC/rm-zext.ll | 6 +- test/CodeGen/PowerPC/rotl-rotr-crash.ll | 12 + test/CodeGen/PowerPC/sdiv-pow2.ll | 8 +- .../selectiondag-extload-computeknownbits.ll | 12 + test/CodeGen/PowerPC/seteq-0.ll | 2 +- test/CodeGen/PowerPC/sjlj.ll | 20 +- test/CodeGen/PowerPC/stack-realign.ll | 26 +- test/CodeGen/PowerPC/stackmap-frame-setup.ll | 20 + test/CodeGen/PowerPC/swaps-le-5.ll | 4 +- test/CodeGen/PowerPC/swaps-le-6.ll | 42 + test/CodeGen/PowerPC/unal-vec-ldst.ll | 580 + test/CodeGen/PowerPC/unal-vec-negarith.ll | 17 + test/CodeGen/PowerPC/unwind-dw2-g.ll | 6 +- test/CodeGen/PowerPC/variable_elem_vec_extracts.ll | 114 + test/CodeGen/PowerPC/vec-asm-disabled.ll | 14 + test/CodeGen/PowerPC/vec_add_sub_quadword.ll | 6 +- .../PowerPC/vector-merge-store-fp-constants.ll | 28 + test/CodeGen/PowerPC/vsx.ll | 5 +- test/CodeGen/PowerPC/vsx_insert_extract_le.ll | 6 +- test/CodeGen/PowerPC/vsx_scalar_ld_st.ll | 6 +- test/CodeGen/PowerPC/vsx_shuffle_le.ll | 20 +- test/CodeGen/SPARC/2011-01-22-SRet.ll | 2 +- test/CodeGen/SPARC/32abi.ll | 191 + test/CodeGen/SPARC/64abi.ll | 84 +- test/CodeGen/SPARC/basictest.ll | 21 +- test/CodeGen/SPARC/float-constants.ll | 41 + test/CodeGen/SPARC/float.ll | 10 +- test/CodeGen/SPARC/fp128.ll | 4 +- test/CodeGen/SPARC/inlineasm.ll | 53 +- test/CodeGen/SPARC/missing-sret.ll | 9 + test/CodeGen/SPARC/reserved-regs.ll | 135 + test/CodeGen/SPARC/select-mask.ll | 17 + test/CodeGen/SPARC/spill.ll | 64 + test/CodeGen/SPARC/stack-align.ll | 22 + test/CodeGen/SPARC/tls.ll | 2 +- test/CodeGen/SPARC/varargs.ll | 2 +- test/CodeGen/SystemZ/alloca-03.ll | 84 + test/CodeGen/SystemZ/alloca-04.ll | 14 + test/CodeGen/SystemZ/args-01.ll | 4 +- test/CodeGen/SystemZ/args-02.ll | 4 +- test/CodeGen/SystemZ/args-03.ll | 4 +- 
test/CodeGen/SystemZ/args-04.ll | 2 +- test/CodeGen/SystemZ/args-07.ll | 2 +- test/CodeGen/SystemZ/asm-17.ll | 3 +- test/CodeGen/SystemZ/asm-18.ll | 3 +- test/CodeGen/SystemZ/dag-combine-01.ll | 97 + test/CodeGen/SystemZ/fp-abs-01.ll | 4 +- test/CodeGen/SystemZ/fp-abs-02.ll | 4 +- test/CodeGen/SystemZ/fp-add-02.ll | 2 +- test/CodeGen/SystemZ/fp-cmp-02.ll | 5 +- test/CodeGen/SystemZ/fp-cmp-05.ll | 80 + test/CodeGen/SystemZ/fp-const-02.ll | 4 +- test/CodeGen/SystemZ/fp-libcall.ll | 273 + test/CodeGen/SystemZ/fp-move-05.ll | 2 +- test/CodeGen/SystemZ/fp-neg-01.ll | 4 +- test/CodeGen/SystemZ/fp-sincos-01.ll | 56 + test/CodeGen/SystemZ/insert-05.ll | 4 +- test/CodeGen/SystemZ/int-cmp-44.ll | 3 +- test/CodeGen/SystemZ/int-cmp-51.ll | 34 + test/CodeGen/SystemZ/int-cmp-52.ll | 24 + test/CodeGen/SystemZ/memchr-01.ll | 2 +- test/CodeGen/SystemZ/spill-01.ll | 2 +- test/CodeGen/SystemZ/vec-args-04.ll | 26 +- test/CodeGen/SystemZ/vec-args-05.ll | 10 +- test/CodeGen/SystemZ/vec-perm-12.ll | 43 + test/CodeGen/SystemZ/vec-perm-13.ll | 38 + test/CodeGen/SystemZ/xor-01.ll | 2 +- test/CodeGen/Thumb/2010-07-15-debugOrdering.ll | 14 +- test/CodeGen/Thumb/cortex-m0-unaligned-access.ll | 2 +- test/CodeGen/Thumb/large-stack.ll | 20 +- .../Thumb/ldm-stm-base-materialization-thumb2.ll | 93 + test/CodeGen/Thumb/ldm-stm-base-materialization.ll | 77 +- test/CodeGen/Thumb/pop.ll | 4 +- test/CodeGen/Thumb/segmented-stacks.ll | 24 +- test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll | 36 - test/CodeGen/Thumb/thumb-shrink-wrapping.ll | 691 + test/CodeGen/Thumb/vargs.ll | 6 +- test/CodeGen/Thumb2/crash.ll | 14 +- test/CodeGen/Thumb2/emit-unwinding.ll | 11 + test/CodeGen/Thumb2/float-cmp.ll | 44 +- test/CodeGen/Thumb2/float-intrinsics-double.ll | 11 +- test/CodeGen/Thumb2/float-intrinsics-float.ll | 4 +- test/CodeGen/Thumb2/ifcvt-compare.ll | 6 +- test/CodeGen/Thumb2/machine-licm.ll | 8 +- test/CodeGen/Thumb2/pic-load.ll | 12 +- test/CodeGen/Thumb2/setjmp_longjmp.ll | 89 + 
test/CodeGen/Thumb2/thumb2-ifcvt1.ll | 14 +- test/CodeGen/Thumb2/thumb2-ifcvt2.ll | 4 +- test/CodeGen/Thumb2/thumb2-mulhi.ll | 2 +- test/CodeGen/Thumb2/thumb2-smla.ll | 4 +- test/CodeGen/Thumb2/thumb2-smul.ll | 2 +- test/CodeGen/Thumb2/thumb2-spill-q.ll | 28 +- test/CodeGen/Thumb2/thumb2-uxt_rot.ll | 8 +- test/CodeGen/Thumb2/v8_IT_1.ll | 4 +- test/CodeGen/Thumb2/v8_IT_3.ll | 5 +- test/CodeGen/Thumb2/v8_IT_5.ll | 4 +- test/CodeGen/WebAssembly/call.ll | 127 + test/CodeGen/WebAssembly/cfg-stackify.ll | 1102 ++ test/CodeGen/WebAssembly/comparisons_f32.ll | 181 + test/CodeGen/WebAssembly/comparisons_f64.ll | 181 + test/CodeGen/WebAssembly/comparisons_i32.ll | 98 + test/CodeGen/WebAssembly/comparisons_i64.ll | 98 + test/CodeGen/WebAssembly/conv.ll | 255 + test/CodeGen/WebAssembly/copysign-casts.ll | 28 + test/CodeGen/WebAssembly/cpus.ll | 17 + test/CodeGen/WebAssembly/dead-vreg.ll | 51 + test/CodeGen/WebAssembly/f32.ll | 154 + test/CodeGen/WebAssembly/f64.ll | 154 + test/CodeGen/WebAssembly/fast-isel.ll | 20 + test/CodeGen/WebAssembly/frem.ll | 26 + test/CodeGen/WebAssembly/func.ll | 62 + test/CodeGen/WebAssembly/global.ll | 177 + test/CodeGen/WebAssembly/globl.ll | 10 + test/CodeGen/WebAssembly/i32.ll | 190 + test/CodeGen/WebAssembly/i64.ll | 190 + test/CodeGen/WebAssembly/ident.ll | 12 + test/CodeGen/WebAssembly/immediates.ll | 198 + test/CodeGen/WebAssembly/inline-asm.ll | 94 + test/CodeGen/WebAssembly/legalize.ll | 62 + test/CodeGen/WebAssembly/load-ext.ll | 96 + test/CodeGen/WebAssembly/load-store-i1.ll | 68 + test/CodeGen/WebAssembly/load.ll | 46 + test/CodeGen/WebAssembly/loop-idiom.ll | 53 + test/CodeGen/WebAssembly/memory-addr32.ll | 27 + test/CodeGen/WebAssembly/memory-addr64.ll | 27 + test/CodeGen/WebAssembly/offset-folding.ll | 48 + test/CodeGen/WebAssembly/offset.ll | 185 + test/CodeGen/WebAssembly/phi.ll | 47 + test/CodeGen/WebAssembly/reg-stackify.ll | 126 + test/CodeGen/WebAssembly/return-int32.ll | 10 + test/CodeGen/WebAssembly/return-void.ll | 10 + 
test/CodeGen/WebAssembly/returned.ll | 49 + test/CodeGen/WebAssembly/select.ll | 135 + test/CodeGen/WebAssembly/signext-zeroext.ll | 60 + test/CodeGen/WebAssembly/store-results.ll | 61 + test/CodeGen/WebAssembly/store-trunc.ll | 46 + test/CodeGen/WebAssembly/store.ll | 42 + test/CodeGen/WebAssembly/switch.ll | 174 + test/CodeGen/WebAssembly/unreachable.ll | 34 + test/CodeGen/WebAssembly/unused-argument.ll | 31 + test/CodeGen/WebAssembly/userstack.ll | 81 + test/CodeGen/WebAssembly/varargs.ll | 123 + test/CodeGen/WebAssembly/vtable.ll | 171 + test/CodeGen/WinEH/cppeh-alloca-sink.ll | 180 - test/CodeGen/WinEH/cppeh-catch-all-win32.ll | 86 - test/CodeGen/WinEH/cppeh-catch-all.ll | 97 - test/CodeGen/WinEH/cppeh-catch-and-throw.ll | 143 - test/CodeGen/WinEH/cppeh-catch-scalar.ll | 126 - test/CodeGen/WinEH/cppeh-catch-unwind.ll | 240 - test/CodeGen/WinEH/cppeh-cleanup-invoke.ll | 91 - test/CodeGen/WinEH/cppeh-demote-liveout.ll | 72 - test/CodeGen/WinEH/cppeh-frame-vars.ll | 272 - test/CodeGen/WinEH/cppeh-inalloca.ll | 194 - test/CodeGen/WinEH/cppeh-min-unwind.ll | 99 - .../CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll | 106 - test/CodeGen/WinEH/cppeh-multi-catch.ll | 226 - test/CodeGen/WinEH/cppeh-nested-1.ll | 194 - test/CodeGen/WinEH/cppeh-nested-2.ll | 324 - test/CodeGen/WinEH/cppeh-nested-3.ll | 260 - test/CodeGen/WinEH/cppeh-nested-rethrow.ll | 212 - test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll | 278 - test/CodeGen/WinEH/cppeh-prepared-catch-all.ll | 47 - .../WinEH/cppeh-prepared-catch-reordered.ll | 165 - test/CodeGen/WinEH/cppeh-prepared-catch.ll | 232 - test/CodeGen/WinEH/cppeh-prepared-cleanups.ll | 245 - test/CodeGen/WinEH/cppeh-shared-empty-catch.ll | 110 - test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll | 394 - test/CodeGen/WinEH/cppeh-state-calc-1.ll | 289 - test/CodeGen/WinEH/seh-catch-all.ll | 59 - test/CodeGen/WinEH/seh-exception-code.ll | 66 - test/CodeGen/WinEH/seh-exception-code2.ll | 91 - test/CodeGen/WinEH/seh-inlined-finally.ll | 83 - 
test/CodeGen/WinEH/seh-outlined-finally-win32.ll | 172 - test/CodeGen/WinEH/seh-outlined-finally.ll | 155 - test/CodeGen/WinEH/seh-prepared-basic.ll | 83 - test/CodeGen/WinEH/seh-resume-phi.ll | 66 - test/CodeGen/WinEH/seh-simple.ll | 233 - test/CodeGen/WinEH/wineh-cloning.ll | 391 + test/CodeGen/WinEH/wineh-demotion.ll | 356 + test/CodeGen/WinEH/wineh-intrinsics-invalid.ll | 26 + test/CodeGen/WinEH/wineh-intrinsics.ll | 44 + test/CodeGen/WinEH/wineh-no-demotion.ll | 130 + .../CodeGen/WinEH/wineh-statenumbering-cleanups.ll | 62 + test/CodeGen/WinEH/wineh-statenumbering.ll | 148 + test/CodeGen/X86/2006-10-02-BoolRetCrash.ll | 1 + .../X86/2006-10-19-SwitchUnnecessaryBranching.ll | 4 +- test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll | 2 +- test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll | 2 +- test/CodeGen/X86/2008-03-14-SpillerCrash.ll | 2 +- .../CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll | 8 +- test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll | 15 +- test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll | 8 +- test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll | 14 +- test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll | 8 +- test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll | 1 + test/CodeGen/X86/2009-06-06-ConcatVectors.ll | 1 + test/CodeGen/X86/2009-10-16-Scope.ll | 6 +- test/CodeGen/X86/2010-01-18-DbgValue.ll | 8 +- test/CodeGen/X86/2010-02-01-DbgValueCrash.ll | 8 +- test/CodeGen/X86/2010-05-25-DotDebugLoc.ll | 22 +- test/CodeGen/X86/2010-05-26-DotDebugLoc.ll | 20 +- test/CodeGen/X86/2010-05-28-Crash.ll | 18 +- test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll | 28 +- test/CodeGen/X86/2010-07-06-DbgCrash.ll | 7 +- test/CodeGen/X86/2010-08-04-StackVariable.ll | 24 +- test/CodeGen/X86/2010-09-16-EmptyFilename.ll | 10 +- test/CodeGen/X86/2010-11-02-DbgParameter.ll | 8 +- test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll | 20 +- test/CodeGen/X86/2011-10-21-widen-cmp.ll | 42 +- .../X86/2011-12-06-AVXVectorExtractCombine.ll | 13 +- test/CodeGen/X86/2011-20-21-zext-ui2fp.ll | 14 +- 
test/CodeGen/X86/2012-01-12-extract-sv.ll | 28 +- test/CodeGen/X86/2012-08-17-legalizer-crash.ll | 3 +- test/CodeGen/X86/2012-1-10-buildvector.ll | 1 + test/CodeGen/X86/2012-11-30-handlemove-dbg.ll | 8 +- test/CodeGen/X86/2012-11-30-misched-dbg.ll | 16 +- test/CodeGen/X86/2012-11-30-regpres-dbg.ll | 8 +- test/CodeGen/X86/3dnow-intrinsics.ll | 4 +- test/CodeGen/X86/GC/alloc_loop.ll | 1 + test/CodeGen/X86/GC/cg-O0.ll | 1 + test/CodeGen/X86/GC/dynamic-frame-size.ll | 10 +- test/CodeGen/X86/GC/lower_gcroot.ll | 1 + test/CodeGen/X86/MachineBranchProb.ll | 4 +- test/CodeGen/X86/MachineSink-DbgValue.ll | 12 +- test/CodeGen/X86/MergeConsecutiveStores.ll | 37 +- test/CodeGen/X86/StackColoring-dbg.ll | 6 +- test/CodeGen/X86/add-nsw-sext.ll | 168 + test/CodeGen/X86/aliases.ll | 26 +- test/CodeGen/X86/and-encoding.ll | 41 + test/CodeGen/X86/atomic-flags.ll | 61 + test/CodeGen/X86/atomic-minmax-i6432.ll | 8 +- test/CodeGen/X86/atomic-non-integer.ll | 108 + test/CodeGen/X86/atomic128.ll | 52 +- test/CodeGen/X86/atomic_mi.ll | 662 +- test/CodeGen/X86/avg.ll | 724 + test/CodeGen/X86/avx-cvt-2.ll | 1 + test/CodeGen/X86/avx-cvt.ll | 6 +- test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 66 + test/CodeGen/X86/avx-intrinsics-x86.ll | 685 +- test/CodeGen/X86/avx-isa-check.ll | 570 + test/CodeGen/X86/avx-load-store.ll | 4 +- test/CodeGen/X86/avx-logic.ll | 2 + test/CodeGen/X86/avx-shift.ll | 1 + test/CodeGen/X86/avx-shuffle-x86_32.ll | 26 +- test/CodeGen/X86/avx-splat.ll | 114 +- test/CodeGen/X86/avx-vbroadcast.ll | 261 +- test/CodeGen/X86/avx-vperm2x128.ll | 44 +- test/CodeGen/X86/avx-win64.ll | 2 - test/CodeGen/X86/avx.ll | 6 +- test/CodeGen/X86/avx2-conversions.ll | 131 +- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll | 120 + test/CodeGen/X86/avx2-intrinsics-x86.ll | 94 +- test/CodeGen/X86/avx2-nontemporal.ll | 17 +- test/CodeGen/X86/avx2-vbroadcast.ll | 441 +- test/CodeGen/X86/avx512-arith.ll | 358 +- test/CodeGen/X86/avx512-bugfix-25270.ll | 35 + 
test/CodeGen/X86/avx512-build-vector.ll | 1 + test/CodeGen/X86/avx512-calling-conv.ll | 481 +- test/CodeGen/X86/avx512-cvt.ll | 119 +- test/CodeGen/X86/avx512-ext.ll | 1835 +++ test/CodeGen/X86/avx512-extract-subvector.ll | 56 + test/CodeGen/X86/avx512-fma.ll | 155 +- test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 185 +- test/CodeGen/X86/avx512-insert-extract.ll | 519 +- test/CodeGen/X86/avx512-intrinsics.ll | 4965 +++++- test/CodeGen/X86/avx512-logic.ll | 164 +- test/CodeGen/X86/avx512-mask-op.ll | 1472 +- test/CodeGen/X86/avx512-skx-insert-subvec.ll | 135 + test/CodeGen/X86/avx512-trunc-ext.ll | 961 -- test/CodeGen/X86/avx512-trunc.ll | 488 + test/CodeGen/X86/avx512-vbroadcast.ll | 262 +- test/CodeGen/X86/avx512-vec-cmp.ll | 27 +- test/CodeGen/X86/avx512bw-intrinsics.ll | 2674 ++- test/CodeGen/X86/avx512bwvl-intrinsics.ll | 748 + test/CodeGen/X86/avx512cd-intrinsics.ll | 18 + test/CodeGen/X86/avx512cdvl-intrinsics.ll | 179 + test/CodeGen/X86/avx512dq-intrinsics.ll | 667 + test/CodeGen/X86/avx512dqvl-intrinsics.ll | 818 +- test/CodeGen/X86/avx512vl-intrinsics.ll | 2977 +++- test/CodeGen/X86/bit-piece-comment.ll | 64 + test/CodeGen/X86/bitreverse.ll | 22 + test/CodeGen/X86/branchfolding-catchpads.ll | 95 + test/CodeGen/X86/buildvec-insertvec.ll | 1 + test/CodeGen/X86/catchpad-realign-savexmm.ll | 53 + test/CodeGen/X86/catchpad-regmask.ll | 144 + test/CodeGen/X86/catchpad-weight.ll | 82 + test/CodeGen/X86/catchret-empty-fallthrough.ll | 53 + test/CodeGen/X86/catchret-fallthrough.ll | 42 + test/CodeGen/X86/cleanuppad-inalloca.ll | 68 + test/CodeGen/X86/cleanuppad-large-codemodel.ll | 27 + test/CodeGen/X86/cleanuppad-realign.ll | 78 + test/CodeGen/X86/clz.ll | 148 +- test/CodeGen/X86/cmp.ll | 44 + test/CodeGen/X86/cmpxchg-clobber-flags.ll | 150 +- test/CodeGen/X86/coal-sections.ll | 23 + test/CodeGen/X86/coalescer-win64.ll | 16 + .../CodeGen/X86/code_placement_cold_loop_blocks.ll | 122 + .../code_placement_ignore_succ_in_inner_loop.ll | 123 + 
test/CodeGen/X86/code_placement_loop_rotation.ll | 80 + test/CodeGen/X86/code_placement_loop_rotation2.ll | 122 + test/CodeGen/X86/codegen-prepare-cast.ll | 2 +- test/CodeGen/X86/coff-comdat.ll | 2 +- test/CodeGen/X86/combine-and.ll | 1 + test/CodeGen/X86/combine-avx-intrinsics.ll | 59 - test/CodeGen/X86/combine-avx2-intrinsics.ll | 74 - test/CodeGen/X86/combine-multiplies.ll | 163 + test/CodeGen/X86/combine-or.ll | 1 + test/CodeGen/X86/combine-sse2-intrinsics.ll | 53 - test/CodeGen/X86/combine-sse41-intrinsics.ll | 91 - test/CodeGen/X86/commute-two-addr.ll | 2 +- test/CodeGen/X86/constant-hoisting-and.ll | 19 + test/CodeGen/X86/constant-hoisting-cmp.ll | 25 + test/CodeGen/X86/copysign-constant-magnitude.ll | 24 +- test/CodeGen/X86/cppeh-nounwind.ll | 35 - test/CodeGen/X86/cxx_tlscc64.ll | 71 + test/CodeGen/X86/dag-fmf-cse.ll | 22 + test/CodeGen/X86/dag-merge-fast-accesses.ll | 90 + test/CodeGen/X86/darwin-tls.ll | 28 + .../X86/dbg-changes-codegen-branch-folding.ll | 48 +- test/CodeGen/X86/dbg-changes-codegen.ll | 9 +- test/CodeGen/X86/dbg-combine.ll | 12 +- test/CodeGen/X86/debugloc-argsize.ll | 58 + test/CodeGen/X86/divide-by-constant.ll | 32 + test/CodeGen/X86/dllexport-x86_64.ll | 10 +- test/CodeGen/X86/dllexport.ll | 8 +- test/CodeGen/X86/dwarf-comp-dir.ll | 2 +- test/CodeGen/X86/dynamic-allocas-VLAs.ll | 2 +- test/CodeGen/X86/eh-null-personality.ll | 25 + test/CodeGen/X86/eh_frame.ll | 4 +- test/CodeGen/X86/emutls-pic.ll | 168 + test/CodeGen/X86/emutls-pie.ll | 131 + test/CodeGen/X86/emutls.ll | 347 + test/CodeGen/X86/emutls_generic.ll | 107 + test/CodeGen/X86/exedeps-movq.ll | 19 + test/CodeGen/X86/expand-vr64-gr64-copy.mir | 36 + .../X86/extractelement-legalization-cycle.ll | 21 + test/CodeGen/X86/extractelement-shuffle.ll | 1 + test/CodeGen/X86/fadd-combines.ll | 224 + test/CodeGen/X86/fast-isel-bitcasts-avx.ll | 244 + test/CodeGen/X86/fast-isel-bitcasts.ll | 245 + test/CodeGen/X86/fast-isel-cmp-branch.ll | 17 +- test/CodeGen/X86/fast-isel-deadcode.ll | 
147 + test/CodeGen/X86/fast-isel-emutls.ll | 48 + test/CodeGen/X86/fast-isel-nontemporal.ll | 111 + test/CodeGen/X86/fast-isel-stackcheck.ll | 44 + test/CodeGen/X86/fast-isel-tls.ll | 2 +- test/CodeGen/X86/fdiv-combine.ll | 69 +- test/CodeGen/X86/fdiv.ll | 52 +- test/CodeGen/X86/fixup-lea.ll | 34 + test/CodeGen/X86/float-asmprint.ll | 15 + test/CodeGen/X86/floor-soft-float.ll | 2 +- test/CodeGen/X86/fma-commute-x86.ll | 761 + test/CodeGen/X86/fma-do-not-commute.ll | 2 +- test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll | 499 +- test/CodeGen/X86/fma-intrinsics-x86.ll | 688 +- test/CodeGen/X86/fma-scalar-memfold.ll | 383 + test/CodeGen/X86/fma_patterns.ll | 1301 +- test/CodeGen/X86/fma_patterns_wide.ll | 851 +- test/CodeGen/X86/fmaxnum.ll | 203 +- test/CodeGen/X86/fminnum.ll | 181 +- test/CodeGen/X86/fmul-combines.ll | 44 +- test/CodeGen/X86/fold-load-binops.ll | 1 + test/CodeGen/X86/fold-load-unops.ll | 1 + test/CodeGen/X86/fold-push.ll | 40 + test/CodeGen/X86/force-align-stack-alloca.ll | 2 +- test/CodeGen/X86/force-align-stack.ll | 2 +- test/CodeGen/X86/fp-fast.ll | 1 + test/CodeGen/X86/fp-logic.ll | 264 + test/CodeGen/X86/fp128-calling-conv.ll | 47 + test/CodeGen/X86/fp128-cast.ll | 279 + test/CodeGen/X86/fp128-compare.ll | 96 + test/CodeGen/X86/fp128-i128.ll | 320 + test/CodeGen/X86/fp128-libcalls.ll | 107 + test/CodeGen/X86/fp128-load.ll | 35 + test/CodeGen/X86/fp128-store.ll | 14 + test/CodeGen/X86/fpcmp-soft-fp.ll | 127 + test/CodeGen/X86/fpstack-debuginstr-kill.ll | 16 +- test/CodeGen/X86/frameescape.ll | 128 - test/CodeGen/X86/frem-msvc32.ll | 12 + test/CodeGen/X86/funclet-layout.ll | 158 + test/CodeGen/X86/function-alias.ll | 12 + test/CodeGen/X86/gcc_except_table.ll | 2 +- test/CodeGen/X86/global-sections.ll | 7 +- test/CodeGen/X86/h-register-store.ll | 25 +- test/CodeGen/X86/h-registers-0.ll | 1 + test/CodeGen/X86/h-registers-1.ll | 1 + test/CodeGen/X86/h-registers-3.ll | 1 + test/CodeGen/X86/half.ll | 4 +- test/CodeGen/X86/hhvm-cc.ll | 241 + 
test/CodeGen/X86/i386-shrink-wrapping.ll | 113 + test/CodeGen/X86/immediate_merging.ll | 82 + test/CodeGen/X86/implicit-null-check.ll | 51 +- test/CodeGen/X86/imul.ll | 63 + test/CodeGen/X86/inalloca-stdcall.ll | 5 +- test/CodeGen/X86/inalloca.ll | 15 +- test/CodeGen/X86/inconsistent_landingpad.ll | 30 + test/CodeGen/X86/inline-asm-2addr.ll | 11 +- test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll | 2 +- test/CodeGen/X86/inline-sse.ll | 34 + test/CodeGen/X86/insertps-from-constantpool.ll | 20 + test/CodeGen/X86/insertps-unfold-load-bug.ll | 33 + test/CodeGen/X86/int-intrinsic.ll | 2 +- test/CodeGen/X86/late-address-taken.ll | 68 + test/CodeGen/X86/lea-opt.ll | 131 + test/CodeGen/X86/lit.local.cfg | 2 +- test/CodeGen/X86/localescape.ll | 143 + test/CodeGen/X86/lower-vec-shift-2.ll | 1 + test/CodeGen/X86/lsr-static-addr.ll | 2 +- test/CodeGen/X86/machine-combiner-int-vec.ll | 112 + test/CodeGen/X86/machine-combiner-int.ll | 194 + test/CodeGen/X86/machine-combiner.ll | 467 +- test/CodeGen/X86/machine-cp.ll | 38 +- test/CodeGen/X86/machine-trace-metrics-crash.ll | 4 +- test/CodeGen/X86/masked_gather_scatter.ll | 2012 ++- test/CodeGen/X86/masked_memop.ll | 524 +- test/CodeGen/X86/materialize.ll | 184 + test/CodeGen/X86/mcu-abi.ll | 112 + test/CodeGen/X86/memcpy-2.ll | 26 +- test/CodeGen/X86/memcpy.ll | 33 + .../X86/merge-store-partially-alias-loads.ll | 52 + .../X86/misched-code-difference-with-debug.ll | 12 +- test/CodeGen/X86/mmx-arg-passing-x86-64.ll | 1 + test/CodeGen/X86/mmx-arg-passing.ll | 1 + test/CodeGen/X86/mmx-coalescing.ll | 84 + test/CodeGen/X86/mmx-intrinsics.ll | 291 +- test/CodeGen/X86/mmx-only.ll | 21 + test/CodeGen/X86/movntdq-no-avx.ll | 2 +- test/CodeGen/X86/movpc32-check.ll | 42 + test/CodeGen/X86/movtopush.ll | 25 +- test/CodeGen/X86/mult-alt-x86.ll | 2 +- test/CodeGen/X86/musttail-varargs.ll | 43 + test/CodeGen/X86/nontemporal-2.ll | 21 +- test/CodeGen/X86/nontemporal.ll | 11 +- test/CodeGen/X86/null-streamer.ll | 4 +- 
test/CodeGen/X86/opt-ext-uses.ll | 8 +- test/CodeGen/X86/or-branch.ll | 30 +- test/CodeGen/X86/or-lea.ll | 120 + test/CodeGen/X86/palignr.ll | 1 + test/CodeGen/X86/patchpoint-verifiable.mir | 42 + test/CodeGen/X86/peephole-na-phys-copy-folding.ll | 190 + test/CodeGen/X86/pmul.ll | 297 +- test/CodeGen/X86/pop-stack-cleanup.ll | 76 + test/CodeGen/X86/powi.ll | 38 +- test/CodeGen/X86/pr11415.ll | 8 +- test/CodeGen/X86/pr11468.ll | 2 +- test/CodeGen/X86/pr11985.ll | 30 +- test/CodeGen/X86/pr13577.ll | 5 +- test/CodeGen/X86/pr15267.ll | 240 +- test/CodeGen/X86/pr17631.ll | 2 +- test/CodeGen/X86/pr21529.ll | 15 - test/CodeGen/X86/pr22019.ll | 2 +- test/CodeGen/X86/pr23900.ll | 29 - test/CodeGen/X86/pr24139.ll | 148 + test/CodeGen/X86/pr24602.ll | 17 + test/CodeGen/X86/pr25828.ll | 30 + test/CodeGen/X86/prolog-push-seq.ll | 19 + test/CodeGen/X86/pseudo_cmov_lower.ll | 267 + test/CodeGen/X86/pseudo_cmov_lower1.ll | 39 + test/CodeGen/X86/pseudo_cmov_lower2.ll | 100 + test/CodeGen/X86/psubus.ll | 580 +- test/CodeGen/X86/push-cfi-debug.ll | 53 + test/CodeGen/X86/push-cfi-obj.ll | 44 + test/CodeGen/X86/push-cfi.ll | 304 + test/CodeGen/X86/ragreedy-hoist-spill.ll | 2 +- test/CodeGen/X86/rem_crash.ll | 257 + test/CodeGen/X86/remat-invalid-liveness.ll | 85 - test/CodeGen/X86/rodata-relocs.ll | 8 +- test/CodeGen/X86/rounding-ops.ll | 24 +- test/CodeGen/X86/safestack.ll | 32 + test/CodeGen/X86/sar_fold.ll | 37 + test/CodeGen/X86/sar_fold64.ll | 43 + test/CodeGen/X86/scalar-fp-to-i64.ll | 151 + test/CodeGen/X86/scalar-int-to-fp.ll | 132 + test/CodeGen/X86/sdiv-pow2.ll | 33 + test/CodeGen/X86/seh-catch-all-win32.ll | 33 +- test/CodeGen/X86/seh-catch-all.ll | 29 +- test/CodeGen/X86/seh-catchpad.ll | 198 + test/CodeGen/X86/seh-except-finally.ll | 71 +- test/CodeGen/X86/seh-exception-code.ll | 38 + test/CodeGen/X86/seh-filter.ll | 21 - test/CodeGen/X86/seh-finally.ll | 50 +- test/CodeGen/X86/seh-safe-div-win32.ll | 42 +- test/CodeGen/X86/seh-safe-div.ll | 54 +- 
test/CodeGen/X86/seh-stack-realign-win32.ll | 99 - test/CodeGen/X86/seh-stack-realign.ll | 34 +- test/CodeGen/X86/setcc-lowering.ll | 1 + test/CodeGen/X86/setcc.ll | 20 + test/CodeGen/X86/shift-bmi2.ll | 20 +- test/CodeGen/X86/shrink-wrap-chkstk.ll | 37 + test/CodeGen/X86/slow-div.ll | 15 + test/CodeGen/X86/slow-unaligned-mem.ll | 95 + test/CodeGen/X86/soft-fp.ll | 34 +- test/CodeGen/X86/soft-sitofp.ll | 169 + test/CodeGen/X86/splat-for-size.ll | 197 +- test/CodeGen/X86/sqrt-fastmath.ll | 9 +- test/CodeGen/X86/sse-align-12.ll | 1 + test/CodeGen/X86/sse-minmax.ll | 2 +- test/CodeGen/X86/sse-only.ll | 20 + test/CodeGen/X86/sse-scalar-fp-arith-unary.ll | 1 + test/CodeGen/X86/sse2-vector-shifts.ll | 282 +- test/CodeGen/X86/sse2.ll | 1 + test/CodeGen/X86/sse3-avx-addsub-2.ll | 312 +- test/CodeGen/X86/sse3-avx-addsub.ll | 197 +- test/CodeGen/X86/sse3-intrinsics-fast-isel.ll | 171 + test/CodeGen/X86/sse3.ll | 7 +- test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll | 47 +- test/CodeGen/X86/sse41-intrinsics-x86.ll | 48 - test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll | 185 +- test/CodeGen/X86/sse41.ll | 65 +- test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll | 98 + test/CodeGen/X86/sse_partial_update.ll | 33 + test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll | 290 + test/CodeGen/X86/stack-align-memcpy.ll | 2 +- test/CodeGen/X86/stack-folding-adx-x86_64.ll | 45 + test/CodeGen/X86/stack-folding-fp-avx1.ll | 18 +- test/CodeGen/X86/stack-folding-fp-sse42.ll | 28 +- test/CodeGen/X86/stack-folding-int-avx1.ll | 40 +- test/CodeGen/X86/stack-folding-int-avx2.ll | 55 +- test/CodeGen/X86/stack-folding-int-sse42.ll | 38 +- test/CodeGen/X86/stack-folding-mmx.ll | 148 +- test/CodeGen/X86/stack-folding-x86_64.ll | 2 +- test/CodeGen/X86/stack-folding-xop.ll | 2 +- test/CodeGen/X86/stack-probe-size.ll | 3 +- test/CodeGen/X86/stack-protector-dbginfo.ll | 36 +- test/CodeGen/X86/stack-protector-weight.ll | 4 +- test/CodeGen/X86/stackmap-frame-setup.ll | 20 + test/CodeGen/X86/statepoint-allocas.ll | 10 
+- test/CodeGen/X86/statepoint-call-lowering.ll | 103 +- test/CodeGen/X86/statepoint-far-call.ll | 4 +- test/CodeGen/X86/statepoint-forward.ll | 16 +- .../X86/statepoint-gctransition-call-lowering.ll | 66 +- test/CodeGen/X86/statepoint-invoke.ll | 78 +- test/CodeGen/X86/statepoint-stack-usage.ll | 54 +- test/CodeGen/X86/statepoint-stackmap-format.ll | 96 +- test/CodeGen/X86/stdarg.ll | 10 +- test/CodeGen/X86/stores-merging.ll | 46 +- test/CodeGen/X86/switch-bt.ll | 8 +- test/CodeGen/X86/switch-edge-weight.ll | 281 + test/CodeGen/X86/switch-jump-table.ll | 54 +- test/CodeGen/X86/switch-order-weight.ll | 2 +- test/CodeGen/X86/switch.ll | 85 +- test/CodeGen/X86/swizzle-2.ll | 1 + test/CodeGen/X86/system-intrinsics-64-xsave.ll | 41 + test/CodeGen/X86/system-intrinsics-64-xsavec.ll | 21 + test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll | 21 + test/CodeGen/X86/system-intrinsics-64-xsaves.ll | 41 + test/CodeGen/X86/system-intrinsics-64.ll | 2 +- test/CodeGen/X86/system-intrinsics-xsave.ll | 23 + test/CodeGen/X86/system-intrinsics-xsavec.ll | 12 + test/CodeGen/X86/system-intrinsics-xsaveopt.ll | 12 + test/CodeGen/X86/system-intrinsics-xsaves.ll | 23 + test/CodeGen/X86/system-intrinsics.ll | 2 +- test/CodeGen/X86/tail-dup-catchret.ll | 31 + test/CodeGen/X86/tail-merge-wineh.ll | 107 + test/CodeGen/X86/tail-opts.ll | 40 +- test/CodeGen/X86/tailcall-mem-intrinsics.ll | 4 +- test/CodeGen/X86/tailcall-msvc-conventions.ll | 189 + test/CodeGen/X86/tailcall-readnone.ll | 15 + test/CodeGen/X86/tls-android-negative.ll | 65 + test/CodeGen/X86/tls-android.ll | 89 + test/CodeGen/X86/tls-models.ll | 2 + test/CodeGen/X86/tls-pie.ll | 8 + test/CodeGen/X86/token_landingpad.ll | 21 + test/CodeGen/X86/trunc-store.ll | 49 + test/CodeGen/X86/unaligned-32-byte-memops.ll | 7 +- test/CodeGen/X86/unaligned-spill-folding.ll | 2 +- test/CodeGen/X86/unknown-location.ll | 8 +- test/CodeGen/X86/v2f32.ll | 1 + test/CodeGen/X86/vec_cast2.ll | 31 +- test/CodeGen/X86/vec_cmp_sint-128.ll | 722 + 
test/CodeGen/X86/vec_cmp_uint-128.ll | 860 + test/CodeGen/X86/vec_ctbits.ll | 129 +- test/CodeGen/X86/vec_extract-avx.ll | 114 +- test/CodeGen/X86/vec_fabs.ll | 2 +- test/CodeGen/X86/vec_fp_to_int.ll | 1269 +- test/CodeGen/X86/vec_insert-5.ll | 1 + test/CodeGen/X86/vec_int_to_fp.ll | 1920 ++- test/CodeGen/X86/vec_minmax_sint.ll | 2090 +++ test/CodeGen/X86/vec_minmax_uint.ll | 2229 +++ test/CodeGen/X86/vec_sdiv_to_shift.ll | 13 + test/CodeGen/X86/vec_trunc_sext.ll | 31 +- test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 130 + test/CodeGen/X86/vec_uint_to_fp.ll | 8 +- test/CodeGen/X86/vector-blend.ll | 72 +- test/CodeGen/X86/vector-idiv.ll | 1 + test/CodeGen/X86/vector-lzcnt-128.ll | 472 +- test/CodeGen/X86/vector-lzcnt-256.ll | 257 +- test/CodeGen/X86/vector-lzcnt-512.ll | 219 + .../CodeGen/X86/vector-merge-store-fp-constants.ll | 35 + test/CodeGen/X86/vector-popcnt-128.ll | 37 +- test/CodeGen/X86/vector-popcnt-256.ll | 73 +- test/CodeGen/X86/vector-popcnt-512.ll | 161 + test/CodeGen/X86/vector-rotate-128.ll | 1595 ++ test/CodeGen/X86/vector-rotate-256.ll | 1089 ++ test/CodeGen/X86/vector-sext.ll | 3988 ++++- test/CodeGen/X86/vector-shift-ashr-128.ll | 917 +- test/CodeGen/X86/vector-shift-ashr-256.ll | 691 +- test/CodeGen/X86/vector-shift-ashr-512.ll | 378 + test/CodeGen/X86/vector-shift-lshr-128.ll | 619 +- test/CodeGen/X86/vector-shift-lshr-256.ll | 444 +- test/CodeGen/X86/vector-shift-lshr-512.ll | 317 + test/CodeGen/X86/vector-shift-shl-128.ll | 501 +- test/CodeGen/X86/vector-shift-shl-256.ll | 403 +- test/CodeGen/X86/vector-shift-shl-512.ll | 293 + test/CodeGen/X86/vector-shuffle-128-v16.ll | 276 + test/CodeGen/X86/vector-shuffle-128-v2.ll | 318 +- test/CodeGen/X86/vector-shuffle-128-v4.ll | 92 + test/CodeGen/X86/vector-shuffle-128-v8.ll | 252 + test/CodeGen/X86/vector-shuffle-256-v16.ll | 249 +- test/CodeGen/X86/vector-shuffle-256-v32.ll | 210 +- test/CodeGen/X86/vector-shuffle-256-v4.ll | 703 +- test/CodeGen/X86/vector-shuffle-256-v8.ll | 221 +- 
test/CodeGen/X86/vector-shuffle-512-v16.ll | 134 + test/CodeGen/X86/vector-shuffle-512-v32.ll | 44 + test/CodeGen/X86/vector-shuffle-512-v8.ll | 2487 ++- test/CodeGen/X86/vector-shuffle-combining.ll | 1 + test/CodeGen/X86/vector-shuffle-mmx.ll | 1 + test/CodeGen/X86/vector-shuffle-sse1.ll | 1 + test/CodeGen/X86/vector-shuffle-sse4a.ll | 140 + test/CodeGen/X86/vector-shuffle-v1.ll | 439 + test/CodeGen/X86/vector-trunc.ll | 681 +- test/CodeGen/X86/vector-tzcnt-128.ll | 2035 +-- test/CodeGen/X86/vector-tzcnt-256.ll | 1455 +- test/CodeGen/X86/vector-tzcnt-512.ll | 271 + test/CodeGen/X86/vector-zext.ll | 1523 +- test/CodeGen/X86/vector-zmov.ll | 1 + ...gisters-cleared-in-machine-functions-liveins.ll | 19 + test/CodeGen/X86/vmovq.ll | 28 + test/CodeGen/X86/vselect-2.ll | 1 + test/CodeGen/X86/vselect-avx.ll | 12 +- test/CodeGen/X86/vselect-minmax.ll | 16332 +++++++++++++------ test/CodeGen/X86/vselect.ll | 1 + test/CodeGen/X86/vshift_scalar.ll | 1 + test/CodeGen/X86/wide-integer-cmp.ll | 130 + test/CodeGen/X86/widen_load-2.ll | 4 +- test/CodeGen/X86/widen_shuffle-1.ll | 1 + test/CodeGen/X86/win-catchpad-csrs.ll | 268 + test/CodeGen/X86/win-catchpad-nested-cxx.ll | 105 + test/CodeGen/X86/win-catchpad-nested.ll | 42 + test/CodeGen/X86/win-catchpad-varargs.ll | 101 + test/CodeGen/X86/win-catchpad.ll | 353 + test/CodeGen/X86/win-cleanuppad.ll | 199 + test/CodeGen/X86/win-funclet-cfi.ll | 95 + test/CodeGen/X86/win-mixed-ehpersonality.ll | 81 + test/CodeGen/X86/win32-eh-states.ll | 213 +- test/CodeGen/X86/win32-eh.ll | 49 +- test/CodeGen/X86/win32-pic-jumptable.ll | 8 +- test/CodeGen/X86/win32-seh-catchpad-realign.ll | 77 + test/CodeGen/X86/win32-seh-catchpad.ll | 231 + test/CodeGen/X86/win32-seh-nested-finally.ll | 80 + test/CodeGen/X86/win32-spill-xmm.ll | 40 + test/CodeGen/X86/win64_frame.ll | 70 +- test/CodeGen/X86/win64_sibcall.ll | 38 + test/CodeGen/X86/win_coreclr_chkstk.ll | 143 + test/CodeGen/X86/win_eh_prepare.ll | 82 - test/CodeGen/X86/win_ftol2.ll | 166 - 
test/CodeGen/X86/wineh-coreclr.ll | 267 + test/CodeGen/X86/wineh-exceptionpointer.ll | 26 + test/CodeGen/X86/wineh-no-ehpads.ll | 20 + test/CodeGen/X86/x32-function_pointer-3.ll | 2 +- test/CodeGen/X86/x32-indirectbr.ll | 26 + test/CodeGen/X86/x32-landingpad.ll | 27 + test/CodeGen/X86/x32-va_start.ll | 99 + test/CodeGen/X86/x86-32-intrcc.ll | 79 + test/CodeGen/X86/x86-64-baseptr.ll | 4 +- .../X86/x86-64-double-precision-shift-left.ll | 17 +- .../X86/x86-64-double-precision-shift-right.ll | 9 +- test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll | 4 +- test/CodeGen/X86/x86-64-intrcc.ll | 86 + test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 108 + test/CodeGen/X86/x86-64-pic-10.ll | 2 +- test/CodeGen/X86/x86-fold-pshufb.ll | 20 +- test/CodeGen/X86/x86-sanitizer-shrink-wrapping.ll | 40 + test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll | 16 +- test/CodeGen/X86/x86-shrink-wrap-unwind.ll | 153 + test/CodeGen/X86/x86-shrink-wrapping.ll | 254 +- test/CodeGen/X86/x86-win64-shrink-wrapping.ll | 126 + test/CodeGen/X86/xop-intrinsics-x86_64.ll | 33 +- test/CodeGen/X86/xop-pcmov.ll | 163 + test/CodeGen/XCore/aliases.ll | 6 +- test/CodeGen/XCore/dwarf_debug.ll | 8 +- 1591 files changed, 135533 insertions(+), 31280 deletions(-) create mode 100644 test/CodeGen/AArch64/aarch64-addv.ll create mode 100644 test/CodeGen/AArch64/aarch64-deferred-spilling.ll create mode 100644 test/CodeGen/AArch64/aarch64-loop-gep-opt.ll create mode 100644 test/CodeGen/AArch64/aarch64-minmaxv.ll create mode 100644 test/CodeGen/AArch64/aarch64-smax-constantfold.ll create mode 100644 test/CodeGen/AArch64/arm64-builtins-linux.ll create mode 100644 test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll create mode 100644 test/CodeGen/AArch64/arm64-fmax-safe.ll create mode 100644 test/CodeGen/AArch64/arm64-ld-from-st.ll create mode 100644 test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll create mode 100644 test/CodeGen/AArch64/bitreverse.ll create mode 100644 test/CodeGen/AArch64/cxx-tlscc.ll create mode 100644 
test/CodeGen/AArch64/dag-combine-select.ll create mode 100644 test/CodeGen/AArch64/divrem.ll create mode 100644 test/CodeGen/AArch64/emutls.ll create mode 100644 test/CodeGen/AArch64/emutls_generic.ll create mode 100644 test/CodeGen/AArch64/eon.ll create mode 100644 test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll create mode 100644 test/CodeGen/AArch64/fast-isel-cmp-vec.ll create mode 100644 test/CodeGen/AArch64/fast-isel-folded-shift.ll create mode 100644 test/CodeGen/AArch64/fcvt_combine.ll create mode 100644 test/CodeGen/AArch64/fdiv_combine.ll create mode 100644 test/CodeGen/AArch64/misched-fusion.ll create mode 100644 test/CodeGen/AArch64/nontemporal.ll create mode 100644 test/CodeGen/AArch64/readcyclecounter.ll create mode 100644 test/CodeGen/AArch64/rotate.ll create mode 100644 test/CodeGen/AArch64/round-conv.ll create mode 100755 test/CodeGen/AArch64/shrink-wrap.ll create mode 100644 test/CodeGen/AArch64/stackmap-frame-setup.ll create mode 100644 test/CodeGen/AArch64/tbi.ll create mode 100644 test/CodeGen/AArch64/vector-fcopysign.ll create mode 100644 test/CodeGen/AMDGPU/addrspacecast.ll create mode 100644 test/CodeGen/AMDGPU/annotate-kernel-features.ll create mode 100644 test/CodeGen/AMDGPU/bitreverse.ll create mode 100644 test/CodeGen/AMDGPU/calling-conventions.ll create mode 100644 test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll create mode 100644 test/CodeGen/AMDGPU/ci-use-flat-for-global.ll create mode 100644 test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll create mode 100644 test/CodeGen/AMDGPU/ds-sub-offset.ll create mode 100644 test/CodeGen/AMDGPU/dynamic_stackalloc.ll create mode 100644 test/CodeGen/AMDGPU/extract-vector-elt-i64.ll create mode 100644 test/CodeGen/AMDGPU/flat-scratch-reg.ll create mode 100644 test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll create mode 100644 test/CodeGen/AMDGPU/global-constant.ll create mode 100644 test/CodeGen/AMDGPU/hsa-globals.ll create mode 100644 test/CodeGen/AMDGPU/hsa-group-segment.ll create mode 100644 
test/CodeGen/AMDGPU/image-attributes.ll create mode 100644 test/CodeGen/AMDGPU/image-resource-id.ll create mode 100644 test/CodeGen/AMDGPU/inline-constraints.ll create mode 100644 test/CodeGen/AMDGPU/large-alloca-compute.ll create mode 100644 test/CodeGen/AMDGPU/large-alloca-graphics.ll delete mode 100644 test/CodeGen/AMDGPU/large-alloca.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll create mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll create mode 100644 test/CodeGen/AMDGPU/llvm.SI.packf16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll create mode 100644 test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll create mode 100644 test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll create mode 100644 test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll create mode 100644 test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll create mode 100644 test/CodeGen/AMDGPU/opencl-image-metadata.ll create mode 100644 test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll create mode 100644 test/CodeGen/AMDGPU/sampler-resource-id.ll create mode 100644 test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll create mode 100644 test/CodeGen/AMDGPU/si-literal-folding.ll create mode 100644 test/CodeGen/AMDGPU/sminmax.ll create mode 100644 test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll create mode 100644 
test/CodeGen/AMDGPU/store_typed.ll create mode 100644 test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll create mode 100644 test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll create mode 100644 test/CodeGen/ARM/MachO-subtypes.ll create mode 100644 test/CodeGen/ARM/Windows/division.ll delete mode 100644 test/CodeGen/ARM/Windows/integer-floating-point-conversion.ll create mode 100644 test/CodeGen/ARM/Windows/libcalls.ll create mode 100644 test/CodeGen/ARM/Windows/no-eabi.ll create mode 100644 test/CodeGen/ARM/Windows/no-frame-register.ll create mode 100644 test/CodeGen/ARM/Windows/overflow.ll create mode 100644 test/CodeGen/ARM/align-sp-adjustment.ll create mode 100644 test/CodeGen/ARM/apcs-vfp.ll create mode 100644 test/CodeGen/ARM/arm-eabi.ll create mode 100644 test/CodeGen/ARM/arm-shrink-wrapping-linux.ll create mode 100644 test/CodeGen/ARM/arm-shrink-wrapping.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-minsize.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-mixed.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-optnone.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-optsize.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization.ll create mode 100644 test/CodeGen/ARM/cfi-alignment.ll create mode 100644 test/CodeGen/ARM/combine-vmovdrr.ll create mode 100644 test/CodeGen/ARM/debugtrap.ll create mode 100644 test/CodeGen/ARM/emutls.ll create mode 100644 test/CodeGen/ARM/emutls1.ll create mode 100644 test/CodeGen/ARM/emutls_generic.ll create mode 100644 test/CodeGen/ARM/fp16-args.ll create mode 100644 test/CodeGen/ARM/gep-optimization.ll create mode 100644 test/CodeGen/ARM/global-merge-external.ll create mode 100644 test/CodeGen/ARM/ldm-stm-base-materialization.ll create mode 100644 test/CodeGen/ARM/legalize-unaligned-load.ll create mode 100644 test/CodeGen/ARM/memcpy-ldm-stm.ll create mode 100644 test/CodeGen/ARM/minmax.ll create mode 100644 
test/CodeGen/ARM/neon_vshl_minint.ll create mode 100644 test/CodeGen/ARM/pr25317.ll create mode 100644 test/CodeGen/ARM/pr25838.ll create mode 100644 test/CodeGen/ARM/rotate.ll create mode 100644 test/CodeGen/ARM/sat-arith.ll delete mode 100644 test/CodeGen/ARM/sched-it-debug-nodes.ll create mode 100644 test/CodeGen/ARM/setjmp_longjmp.ll create mode 100644 test/CodeGen/ARM/softfp-fabs-fneg.ll create mode 100644 test/CodeGen/ARM/ssat-lower.ll create mode 100644 test/CodeGen/ARM/ssat-upper.ll create mode 100644 test/CodeGen/ARM/subtarget-no-movt.ll create mode 100644 test/CodeGen/ARM/thumb1-ldst-opt.ll create mode 100644 test/CodeGen/ARM/unaligned_load_store_vfp.ll create mode 100644 test/CodeGen/ARM/usat-lower.ll create mode 100644 test/CodeGen/ARM/usat-upper.ll create mode 100644 test/CodeGen/ARM/v7k-abi-align.ll create mode 100644 test/CodeGen/ARM/v7k-libcalls.ll create mode 100644 test/CodeGen/ARM/v7k-sincos.ll create mode 100644 test/CodeGen/ARM/vfp-reg-stride.ll create mode 100644 test/CodeGen/ARM/vld-vst-upgrade.ll create mode 100644 test/CodeGen/ARM/vminmaxnm-safe.ll create mode 100644 test/CodeGen/CPP/gep.ll create mode 100644 test/CodeGen/Generic/ForceStackAlign.ll create mode 100644 test/CodeGen/Generic/lit.local.cfg create mode 100644 test/CodeGen/Hexagon/NVJumpCmp.ll create mode 100644 test/CodeGen/Hexagon/bit-eval.ll create mode 100644 test/CodeGen/Hexagon/bit-loop.ll create mode 100644 test/CodeGen/Hexagon/cfi-late.ll create mode 100644 test/CodeGen/Hexagon/early-if-conversion-bug1.ll create mode 100644 test/CodeGen/Hexagon/early-if-phi-i1.ll create mode 100644 test/CodeGen/Hexagon/early-if-spare.ll create mode 100644 test/CodeGen/Hexagon/early-if.ll create mode 100644 test/CodeGen/Hexagon/ifcvt-edge-weight.ll create mode 100644 test/CodeGen/Hexagon/memcpy-likely-aligned.ll create mode 100644 test/CodeGen/Hexagon/mux-basic.ll create mode 100644 test/CodeGen/Hexagon/pic-jumptables.ll create mode 100644 test/CodeGen/Hexagon/pic-simple.ll create mode 
100644 test/CodeGen/Hexagon/pic-static.ll create mode 100644 test/CodeGen/Hexagon/sdr-basic.ll create mode 100644 test/CodeGen/Hexagon/sdr-shr32.ll create mode 100644 test/CodeGen/Hexagon/store-widen-aliased-load.ll create mode 100644 test/CodeGen/Hexagon/store-widen-negv.ll create mode 100644 test/CodeGen/Hexagon/store-widen-negv2.ll create mode 100644 test/CodeGen/Hexagon/store-widen.ll create mode 100644 test/CodeGen/Hexagon/tail-dup-subreg-abort.ll create mode 100644 test/CodeGen/Hexagon/v60Intrins.ll create mode 100644 test/CodeGen/Hexagon/v60Vasr.ll create mode 100644 test/CodeGen/Hexagon/v60small.ll create mode 100644 test/CodeGen/MIR/AArch64/cfi-def-cfa.mir create mode 100644 test/CodeGen/MIR/AArch64/expected-target-flag-name.mir create mode 100644 test/CodeGen/MIR/AArch64/invalid-target-flag-name.mir create mode 100644 test/CodeGen/MIR/AArch64/lit.local.cfg create mode 100644 test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir create mode 100644 test/CodeGen/MIR/AArch64/stack-object-local-offset.mir create mode 100644 test/CodeGen/MIR/AArch64/target-flags.mir create mode 100644 test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir create mode 100644 test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir create mode 100644 test/CodeGen/MIR/AMDGPU/lit.local.cfg create mode 100644 test/CodeGen/MIR/AMDGPU/target-index-operands.mir create mode 100644 test/CodeGen/MIR/ARM/ARMLoadStoreDBG.mir create mode 100644 test/CodeGen/MIR/ARM/bundled-instructions.mir create mode 100644 test/CodeGen/MIR/ARM/cfi-same-value.mir create mode 100644 test/CodeGen/MIR/ARM/expected-closing-brace.mir create mode 100644 test/CodeGen/MIR/ARM/extraneous-closing-brace-error.mir create mode 100644 test/CodeGen/MIR/ARM/lit.local.cfg create mode 100644 test/CodeGen/MIR/ARM/nested-instruction-bundle-error.mir create mode 100644 test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir create mode 100644 test/CodeGen/MIR/Generic/basic-blocks.mir create mode 100644 
test/CodeGen/MIR/Generic/expected-colon-after-basic-block.mir create mode 100644 test/CodeGen/MIR/Generic/expected-mbb-reference-for-successor-mbb.mir create mode 100644 test/CodeGen/MIR/Generic/frame-info.mir create mode 100644 test/CodeGen/MIR/Generic/function-missing-machine-function.mir create mode 100644 test/CodeGen/MIR/Generic/invalid-jump-table-kind.mir create mode 100644 test/CodeGen/MIR/Generic/lit.local.cfg create mode 100644 test/CodeGen/MIR/Generic/llvm-ir-error-reported.mir create mode 100644 test/CodeGen/MIR/Generic/llvmIR.mir create mode 100644 test/CodeGen/MIR/Generic/llvmIRMissing.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-redefinition-error.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-undefined-ir-block.mir create mode 100644 test/CodeGen/MIR/Generic/machine-basic-block-unknown-name.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-function.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-name.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-redefinition-error.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function.mir create mode 100644 test/CodeGen/MIR/Generic/register-info.mir create mode 100644 test/CodeGen/MIR/Mips/expected-global-value-or-symbol-after-call-entry.mir create mode 100644 test/CodeGen/MIR/Mips/lit.local.cfg create mode 100644 test/CodeGen/MIR/Mips/memory-operands.mir create mode 100644 test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir create mode 100644 test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir create mode 100644 test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir create mode 100644 test/CodeGen/MIR/NVPTX/lit.local.cfg create mode 100644 test/CodeGen/MIR/PowerPC/lit.local.cfg create mode 100644 
test/CodeGen/MIR/PowerPC/unordered-implicit-registers.mir create mode 100644 test/CodeGen/MIR/X86/basic-block-not-at-start-of-line-error.mir create mode 100644 test/CodeGen/MIR/X86/block-address-operands.mir create mode 100644 test/CodeGen/MIR/X86/callee-saved-info.mir create mode 100644 test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir create mode 100644 test/CodeGen/MIR/X86/cfi-def-cfa-register.mir create mode 100644 test/CodeGen/MIR/X86/cfi-offset.mir create mode 100644 test/CodeGen/MIR/X86/constant-pool-item-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/constant-pool.mir create mode 100644 test/CodeGen/MIR/X86/constant-value-error.mir create mode 100644 test/CodeGen/MIR/X86/def-register-already-tied-error.mir create mode 100644 test/CodeGen/MIR/X86/duplicate-memory-operand-flag.mir create mode 100644 test/CodeGen/MIR/X86/duplicate-register-flag-error.mir create mode 100644 test/CodeGen/MIR/X86/early-clobber-register-flag.mir create mode 100644 test/CodeGen/MIR/X86/expected-align-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-alignment-after-align-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-basic-block-at-start-of-body.mir create mode 100644 test/CodeGen/MIR/X86/expected-block-reference-in-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/expected-comma-after-cfi-register.mir create mode 100644 test/CodeGen/MIR/X86/expected-comma-after-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-from-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-function-reference-after-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/expected-global-value-after-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/expected-integer-after-offset-sign.mir create mode 100644 test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir create mode 100644 test/CodeGen/MIR/X86/expected-integer-in-successor-weight.mir create mode 100644 
test/CodeGen/MIR/X86/expected-load-or-store-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-metadata-node-after-debug-location.mir create mode 100644 test/CodeGen/MIR/X86/expected-metadata-node-after-exclaim.mir create mode 100644 test/CodeGen/MIR/X86/expected-metadata-node-in-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir create mode 100644 test/CodeGen/MIR/X86/expected-named-register-in-callee-saved-register.mir create mode 100644 test/CodeGen/MIR/X86/expected-named-register-in-functions-livein.mir create mode 100644 test/CodeGen/MIR/X86/expected-newline-at-end-of-list.mir create mode 100644 test/CodeGen/MIR/X86/expected-offset-after-cfi-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-pointer-value-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-positive-alignment-after-align.mir create mode 100644 test/CodeGen/MIR/X86/expected-register-after-cfi-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir create mode 100644 test/CodeGen/MIR/X86/expected-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/expected-target-flag-name.mir create mode 100644 test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir create mode 100644 test/CodeGen/MIR/X86/expected-value-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/expected-virtual-register-in-functions-livein.mir create mode 100644 test/CodeGen/MIR/X86/external-symbol-operands.mir create mode 100644 test/CodeGen/MIR/X86/fixed-stack-memory-operands.mir create mode 100644 test/CodeGen/MIR/X86/fixed-stack-object-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/frame-info-save-restore-points.mir create mode 100644 test/CodeGen/MIR/X86/frame-info-stack-references.mir create mode 100644 test/CodeGen/MIR/X86/frame-setup-instruction-flag.mir create mode 100644 test/CodeGen/MIR/X86/function-liveins.mir create mode 100644 
test/CodeGen/MIR/X86/inline-asm-registers.mir create mode 100644 test/CodeGen/MIR/X86/instructions-debug-location.mir create mode 100644 test/CodeGen/MIR/X86/invalid-constant-pool-item.mir create mode 100644 test/CodeGen/MIR/X86/invalid-metadata-node-type.mir create mode 100644 test/CodeGen/MIR/X86/invalid-target-flag-name.mir create mode 100644 test/CodeGen/MIR/X86/invalid-tied-def-index-error.mir create mode 100644 test/CodeGen/MIR/X86/jump-table-info.mir create mode 100644 test/CodeGen/MIR/X86/jump-table-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/large-cfi-offset-number-error.mir create mode 100644 test/CodeGen/MIR/X86/large-immediate-operand-error.mir create mode 100644 test/CodeGen/MIR/X86/large-offset-number-error.mir create mode 100644 test/CodeGen/MIR/X86/large-size-in-memory-operand-error.mir create mode 100644 test/CodeGen/MIR/X86/liveout-register-mask.mir create mode 100644 test/CodeGen/MIR/X86/machine-verifier.mir create mode 100644 test/CodeGen/MIR/X86/memory-operands.mir create mode 100644 test/CodeGen/MIR/X86/metadata-operands.mir create mode 100644 test/CodeGen/MIR/X86/missing-closing-quote.mir delete mode 100644 test/CodeGen/MIR/X86/missing-instruction.mir create mode 100644 test/CodeGen/MIR/X86/newline-handling.mir create mode 100644 test/CodeGen/MIR/X86/register-operands-target-flag-error.mir create mode 100644 test/CodeGen/MIR/X86/simple-register-allocation-hints.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-debug-info.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-invalid-name.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-operand-name-mismatch-error.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-operands.mir create mode 100644 test/CodeGen/MIR/X86/stack-object-redefinition-error.mir create mode 100644 test/CodeGen/MIR/X86/standalone-register-error.mir create mode 100644 test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir create mode 100644 
test/CodeGen/MIR/X86/successor-basic-blocks.mir create mode 100644 test/CodeGen/MIR/X86/tied-def-operand-invalid.mir create mode 100644 test/CodeGen/MIR/X86/undefined-fixed-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/undefined-ir-block-in-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/undefined-ir-block-slot-in-blockaddress.mir create mode 100644 test/CodeGen/MIR/X86/undefined-jump-table-id.mir create mode 100644 test/CodeGen/MIR/X86/undefined-stack-object.mir create mode 100644 test/CodeGen/MIR/X86/undefined-value-in-memory-operand.mir create mode 100644 test/CodeGen/MIR/X86/unknown-metadata-keyword.mir create mode 100644 test/CodeGen/MIR/X86/unknown-metadata-node.mir create mode 100644 test/CodeGen/MIR/X86/used-physical-register-info.mir create mode 100644 test/CodeGen/MIR/X86/virtual-register-redefinition-error.mir delete mode 100644 test/CodeGen/MIR/basic-blocks.mir delete mode 100644 test/CodeGen/MIR/expected-eof-after-successor-mbb.mir delete mode 100644 test/CodeGen/MIR/expected-mbb-reference-for-successor-mbb.mir delete mode 100644 test/CodeGen/MIR/frame-info.mir delete mode 100644 test/CodeGen/MIR/function-missing-machine-function.mir delete mode 100644 test/CodeGen/MIR/llvm-ir-error-reported.mir delete mode 100644 test/CodeGen/MIR/llvmIR.mir delete mode 100644 test/CodeGen/MIR/llvmIRMissing.mir delete mode 100644 test/CodeGen/MIR/machine-basic-block-redefinition-error.mir delete mode 100644 test/CodeGen/MIR/machine-basic-block-unknown-name.mir delete mode 100644 test/CodeGen/MIR/machine-function-missing-body-error.mir delete mode 100644 test/CodeGen/MIR/machine-function-missing-function.mir delete mode 100644 test/CodeGen/MIR/machine-function-missing-name.mir delete mode 100644 test/CodeGen/MIR/machine-function-redefinition-error.mir delete mode 100644 test/CodeGen/MIR/machine-function.mir delete mode 100644 test/CodeGen/MIR/register-info.mir delete mode 100644 test/CodeGen/MIR/successor-basic-blocks.mir create mode 100644 
test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll create mode 100644 test/CodeGen/Mips/emutls_generic.ll create mode 100644 test/CodeGen/Mips/interrupt-attr-64-error.ll create mode 100644 test/CodeGen/Mips/interrupt-attr-args-error.ll create mode 100644 test/CodeGen/Mips/interrupt-attr-error.ll create mode 100644 test/CodeGen/Mips/interrupt-attr.ll create mode 100644 test/CodeGen/Mips/llvm-ir/atomicrmx.ll create mode 100644 test/CodeGen/Mips/llvm-ir/load-atomic.ll create mode 100644 test/CodeGen/Mips/llvm-ir/sqrt.ll create mode 100644 test/CodeGen/Mips/llvm-ir/store-atomic.ll create mode 100644 test/CodeGen/NVPTX/branch-fold.ll create mode 100644 test/CodeGen/NVPTX/bypass-div.ll create mode 100644 test/CodeGen/NVPTX/combine-min-max.ll create mode 100644 test/CodeGen/NVPTX/global-addrspace.ll create mode 100644 test/CodeGen/NVPTX/load-with-non-coherent-cache.ll create mode 100644 test/CodeGen/NVPTX/reg-copy.ll create mode 100644 test/CodeGen/PowerPC/BoolRetToIntTest.ll create mode 100644 test/CodeGen/PowerPC/BreakableToken-reduced.ll create mode 100644 test/CodeGen/PowerPC/aantidep-def-ec.mir create mode 100644 test/CodeGen/PowerPC/aantidep-inline-asm-use.ll create mode 100644 test/CodeGen/PowerPC/addisdtprelha-nonr3.mir create mode 100644 test/CodeGen/PowerPC/bitcasts-direct-move.ll create mode 100644 test/CodeGen/PowerPC/bitreverse.ll create mode 100644 test/CodeGen/PowerPC/branch-hint.ll create mode 100644 test/CodeGen/PowerPC/coal-sections.ll create mode 100644 test/CodeGen/PowerPC/crbit-asm-disabled.ll create mode 100644 test/CodeGen/PowerPC/dyn-alloca-offset.ll create mode 100644 test/CodeGen/PowerPC/e500-1.ll create mode 100644 test/CodeGen/PowerPC/emutls_generic.ll create mode 100644 test/CodeGen/PowerPC/fma-mutate-register-constraint.ll create mode 100644 test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll create mode 100644 test/CodeGen/PowerPC/machine-combiner.ll create mode 100644 test/CodeGen/PowerPC/mc-instrlat.ll create mode 100644 
test/CodeGen/PowerPC/mcm-13.ll create mode 100644 test/CodeGen/PowerPC/merge-st-chain-op.ll create mode 100644 test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll create mode 100644 test/CodeGen/PowerPC/peephole-align.ll create mode 100644 test/CodeGen/PowerPC/ppc-shrink-wrapping.ll create mode 100644 test/CodeGen/PowerPC/ppcsoftops.ll create mode 100644 test/CodeGen/PowerPC/pr24636.ll create mode 100644 test/CodeGen/PowerPC/pr25157-peephole.ll create mode 100644 test/CodeGen/PowerPC/preincprep-nontrans-crash.ll create mode 100644 test/CodeGen/PowerPC/qpx-unal-cons-lds.ll create mode 100644 test/CodeGen/PowerPC/rotl-rotr-crash.ll create mode 100644 test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll create mode 100644 test/CodeGen/PowerPC/stackmap-frame-setup.ll create mode 100644 test/CodeGen/PowerPC/swaps-le-6.ll create mode 100644 test/CodeGen/PowerPC/unal-vec-ldst.ll create mode 100644 test/CodeGen/PowerPC/unal-vec-negarith.ll create mode 100644 test/CodeGen/PowerPC/variable_elem_vec_extracts.ll create mode 100644 test/CodeGen/PowerPC/vec-asm-disabled.ll create mode 100644 test/CodeGen/PowerPC/vector-merge-store-fp-constants.ll create mode 100644 test/CodeGen/SPARC/32abi.ll create mode 100644 test/CodeGen/SPARC/float-constants.ll create mode 100644 test/CodeGen/SPARC/missing-sret.ll create mode 100644 test/CodeGen/SPARC/reserved-regs.ll create mode 100644 test/CodeGen/SPARC/select-mask.ll create mode 100644 test/CodeGen/SPARC/spill.ll create mode 100644 test/CodeGen/SPARC/stack-align.ll create mode 100644 test/CodeGen/SystemZ/alloca-03.ll create mode 100644 test/CodeGen/SystemZ/alloca-04.ll create mode 100644 test/CodeGen/SystemZ/dag-combine-01.ll create mode 100644 test/CodeGen/SystemZ/fp-cmp-05.ll create mode 100644 test/CodeGen/SystemZ/fp-libcall.ll create mode 100644 test/CodeGen/SystemZ/fp-sincos-01.ll create mode 100644 test/CodeGen/SystemZ/int-cmp-51.ll create mode 100644 test/CodeGen/SystemZ/int-cmp-52.ll create mode 100644 
test/CodeGen/SystemZ/vec-perm-12.ll create mode 100644 test/CodeGen/SystemZ/vec-perm-13.ll create mode 100644 test/CodeGen/Thumb/ldm-stm-base-materialization-thumb2.ll delete mode 100644 test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll create mode 100644 test/CodeGen/Thumb/thumb-shrink-wrapping.ll create mode 100644 test/CodeGen/Thumb2/emit-unwinding.ll create mode 100644 test/CodeGen/Thumb2/setjmp_longjmp.ll create mode 100644 test/CodeGen/WebAssembly/call.ll create mode 100644 test/CodeGen/WebAssembly/cfg-stackify.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_f32.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_f64.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_i32.ll create mode 100644 test/CodeGen/WebAssembly/comparisons_i64.ll create mode 100644 test/CodeGen/WebAssembly/conv.ll create mode 100644 test/CodeGen/WebAssembly/copysign-casts.ll create mode 100644 test/CodeGen/WebAssembly/cpus.ll create mode 100644 test/CodeGen/WebAssembly/dead-vreg.ll create mode 100644 test/CodeGen/WebAssembly/f32.ll create mode 100644 test/CodeGen/WebAssembly/f64.ll create mode 100644 test/CodeGen/WebAssembly/fast-isel.ll create mode 100644 test/CodeGen/WebAssembly/frem.ll create mode 100644 test/CodeGen/WebAssembly/func.ll create mode 100644 test/CodeGen/WebAssembly/global.ll create mode 100644 test/CodeGen/WebAssembly/globl.ll create mode 100644 test/CodeGen/WebAssembly/i32.ll create mode 100644 test/CodeGen/WebAssembly/i64.ll create mode 100644 test/CodeGen/WebAssembly/ident.ll create mode 100644 test/CodeGen/WebAssembly/immediates.ll create mode 100644 test/CodeGen/WebAssembly/inline-asm.ll create mode 100644 test/CodeGen/WebAssembly/legalize.ll create mode 100644 test/CodeGen/WebAssembly/load-ext.ll create mode 100644 test/CodeGen/WebAssembly/load-store-i1.ll create mode 100644 test/CodeGen/WebAssembly/load.ll create mode 100644 test/CodeGen/WebAssembly/loop-idiom.ll create mode 100644 test/CodeGen/WebAssembly/memory-addr32.ll create mode 100644 
test/CodeGen/WebAssembly/memory-addr64.ll create mode 100644 test/CodeGen/WebAssembly/offset-folding.ll create mode 100644 test/CodeGen/WebAssembly/offset.ll create mode 100644 test/CodeGen/WebAssembly/phi.ll create mode 100644 test/CodeGen/WebAssembly/reg-stackify.ll create mode 100644 test/CodeGen/WebAssembly/return-int32.ll create mode 100644 test/CodeGen/WebAssembly/return-void.ll create mode 100644 test/CodeGen/WebAssembly/returned.ll create mode 100644 test/CodeGen/WebAssembly/select.ll create mode 100644 test/CodeGen/WebAssembly/signext-zeroext.ll create mode 100644 test/CodeGen/WebAssembly/store-results.ll create mode 100644 test/CodeGen/WebAssembly/store-trunc.ll create mode 100644 test/CodeGen/WebAssembly/store.ll create mode 100644 test/CodeGen/WebAssembly/switch.ll create mode 100644 test/CodeGen/WebAssembly/unreachable.ll create mode 100644 test/CodeGen/WebAssembly/unused-argument.ll create mode 100644 test/CodeGen/WebAssembly/userstack.ll create mode 100644 test/CodeGen/WebAssembly/varargs.ll create mode 100644 test/CodeGen/WebAssembly/vtable.ll delete mode 100644 test/CodeGen/WinEH/cppeh-alloca-sink.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-all-win32.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-all.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-and-throw.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-scalar.ll delete mode 100644 test/CodeGen/WinEH/cppeh-catch-unwind.ll delete mode 100644 test/CodeGen/WinEH/cppeh-cleanup-invoke.ll delete mode 100644 test/CodeGen/WinEH/cppeh-demote-liveout.ll delete mode 100644 test/CodeGen/WinEH/cppeh-frame-vars.ll delete mode 100644 test/CodeGen/WinEH/cppeh-inalloca.ll delete mode 100644 test/CodeGen/WinEH/cppeh-min-unwind.ll delete mode 100644 test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll delete mode 100644 test/CodeGen/WinEH/cppeh-multi-catch.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nested-1.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nested-2.ll delete mode 100644 
test/CodeGen/WinEH/cppeh-nested-3.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nested-rethrow.ll delete mode 100644 test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-catch-all.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-catch.ll delete mode 100644 test/CodeGen/WinEH/cppeh-prepared-cleanups.ll delete mode 100644 test/CodeGen/WinEH/cppeh-shared-empty-catch.ll delete mode 100644 test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll delete mode 100644 test/CodeGen/WinEH/cppeh-state-calc-1.ll delete mode 100644 test/CodeGen/WinEH/seh-catch-all.ll delete mode 100644 test/CodeGen/WinEH/seh-exception-code.ll delete mode 100644 test/CodeGen/WinEH/seh-exception-code2.ll delete mode 100644 test/CodeGen/WinEH/seh-inlined-finally.ll delete mode 100644 test/CodeGen/WinEH/seh-outlined-finally-win32.ll delete mode 100644 test/CodeGen/WinEH/seh-outlined-finally.ll delete mode 100644 test/CodeGen/WinEH/seh-prepared-basic.ll delete mode 100644 test/CodeGen/WinEH/seh-resume-phi.ll delete mode 100644 test/CodeGen/WinEH/seh-simple.ll create mode 100644 test/CodeGen/WinEH/wineh-cloning.ll create mode 100644 test/CodeGen/WinEH/wineh-demotion.ll create mode 100644 test/CodeGen/WinEH/wineh-intrinsics-invalid.ll create mode 100644 test/CodeGen/WinEH/wineh-intrinsics.ll create mode 100644 test/CodeGen/WinEH/wineh-no-demotion.ll create mode 100644 test/CodeGen/WinEH/wineh-statenumbering-cleanups.ll create mode 100644 test/CodeGen/WinEH/wineh-statenumbering.ll create mode 100644 test/CodeGen/X86/add-nsw-sext.ll create mode 100644 test/CodeGen/X86/and-encoding.ll create mode 100644 test/CodeGen/X86/atomic-flags.ll create mode 100644 test/CodeGen/X86/atomic-non-integer.ll create mode 100644 test/CodeGen/X86/avg.ll create mode 100644 test/CodeGen/X86/avx-isa-check.ll create mode 100644 test/CodeGen/X86/avx512-bugfix-25270.ll create mode 100644 
test/CodeGen/X86/avx512-ext.ll create mode 100644 test/CodeGen/X86/avx512-extract-subvector.ll create mode 100644 test/CodeGen/X86/avx512-skx-insert-subvec.ll delete mode 100644 test/CodeGen/X86/avx512-trunc-ext.ll create mode 100644 test/CodeGen/X86/avx512-trunc.ll create mode 100644 test/CodeGen/X86/avx512cd-intrinsics.ll create mode 100644 test/CodeGen/X86/avx512cdvl-intrinsics.ll create mode 100644 test/CodeGen/X86/avx512dq-intrinsics.ll create mode 100644 test/CodeGen/X86/bit-piece-comment.ll create mode 100644 test/CodeGen/X86/bitreverse.ll create mode 100644 test/CodeGen/X86/branchfolding-catchpads.ll create mode 100644 test/CodeGen/X86/catchpad-realign-savexmm.ll create mode 100644 test/CodeGen/X86/catchpad-regmask.ll create mode 100644 test/CodeGen/X86/catchpad-weight.ll create mode 100644 test/CodeGen/X86/catchret-empty-fallthrough.ll create mode 100644 test/CodeGen/X86/catchret-fallthrough.ll create mode 100644 test/CodeGen/X86/cleanuppad-inalloca.ll create mode 100644 test/CodeGen/X86/cleanuppad-large-codemodel.ll create mode 100644 test/CodeGen/X86/cleanuppad-realign.ll create mode 100644 test/CodeGen/X86/coal-sections.ll create mode 100644 test/CodeGen/X86/coalescer-win64.ll create mode 100644 test/CodeGen/X86/code_placement_cold_loop_blocks.ll create mode 100644 test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll create mode 100644 test/CodeGen/X86/code_placement_loop_rotation.ll create mode 100644 test/CodeGen/X86/code_placement_loop_rotation2.ll create mode 100644 test/CodeGen/X86/combine-multiplies.ll delete mode 100644 test/CodeGen/X86/combine-sse2-intrinsics.ll create mode 100644 test/CodeGen/X86/constant-hoisting-and.ll create mode 100644 test/CodeGen/X86/constant-hoisting-cmp.ll delete mode 100644 test/CodeGen/X86/cppeh-nounwind.ll create mode 100644 test/CodeGen/X86/cxx_tlscc64.ll create mode 100644 test/CodeGen/X86/dag-fmf-cse.ll create mode 100644 test/CodeGen/X86/dag-merge-fast-accesses.ll create mode 100644 
test/CodeGen/X86/darwin-tls.ll create mode 100644 test/CodeGen/X86/debugloc-argsize.ll create mode 100644 test/CodeGen/X86/eh-null-personality.ll create mode 100644 test/CodeGen/X86/emutls-pic.ll create mode 100644 test/CodeGen/X86/emutls-pie.ll create mode 100644 test/CodeGen/X86/emutls.ll create mode 100644 test/CodeGen/X86/emutls_generic.ll create mode 100644 test/CodeGen/X86/expand-vr64-gr64-copy.mir create mode 100644 test/CodeGen/X86/extractelement-legalization-cycle.ll create mode 100644 test/CodeGen/X86/fadd-combines.ll create mode 100644 test/CodeGen/X86/fast-isel-bitcasts-avx.ll create mode 100644 test/CodeGen/X86/fast-isel-bitcasts.ll create mode 100644 test/CodeGen/X86/fast-isel-deadcode.ll create mode 100644 test/CodeGen/X86/fast-isel-emutls.ll create mode 100644 test/CodeGen/X86/fast-isel-nontemporal.ll create mode 100644 test/CodeGen/X86/fast-isel-stackcheck.ll create mode 100644 test/CodeGen/X86/fixup-lea.ll create mode 100644 test/CodeGen/X86/fma-commute-x86.ll create mode 100644 test/CodeGen/X86/fma-scalar-memfold.ll create mode 100644 test/CodeGen/X86/fold-push.ll create mode 100644 test/CodeGen/X86/fp-logic.ll create mode 100644 test/CodeGen/X86/fp128-calling-conv.ll create mode 100644 test/CodeGen/X86/fp128-cast.ll create mode 100644 test/CodeGen/X86/fp128-compare.ll create mode 100644 test/CodeGen/X86/fp128-i128.ll create mode 100644 test/CodeGen/X86/fp128-libcalls.ll create mode 100644 test/CodeGen/X86/fp128-load.ll create mode 100644 test/CodeGen/X86/fp128-store.ll create mode 100644 test/CodeGen/X86/fpcmp-soft-fp.ll delete mode 100644 test/CodeGen/X86/frameescape.ll create mode 100644 test/CodeGen/X86/frem-msvc32.ll create mode 100644 test/CodeGen/X86/funclet-layout.ll create mode 100644 test/CodeGen/X86/function-alias.ll create mode 100644 test/CodeGen/X86/hhvm-cc.ll create mode 100644 test/CodeGen/X86/i386-shrink-wrapping.ll create mode 100644 test/CodeGen/X86/immediate_merging.ll create mode 100644 
test/CodeGen/X86/inconsistent_landingpad.ll create mode 100644 test/CodeGen/X86/inline-sse.ll create mode 100644 test/CodeGen/X86/insertps-from-constantpool.ll create mode 100644 test/CodeGen/X86/insertps-unfold-load-bug.ll create mode 100644 test/CodeGen/X86/late-address-taken.ll create mode 100644 test/CodeGen/X86/lea-opt.ll create mode 100644 test/CodeGen/X86/localescape.ll create mode 100644 test/CodeGen/X86/machine-combiner-int-vec.ll create mode 100644 test/CodeGen/X86/machine-combiner-int.ll create mode 100644 test/CodeGen/X86/materialize.ll create mode 100644 test/CodeGen/X86/mcu-abi.ll create mode 100644 test/CodeGen/X86/merge-store-partially-alias-loads.ll create mode 100644 test/CodeGen/X86/mmx-coalescing.ll create mode 100644 test/CodeGen/X86/mmx-only.ll create mode 100644 test/CodeGen/X86/movpc32-check.ll create mode 100644 test/CodeGen/X86/or-lea.ll create mode 100644 test/CodeGen/X86/patchpoint-verifiable.mir create mode 100644 test/CodeGen/X86/peephole-na-phys-copy-folding.ll create mode 100644 test/CodeGen/X86/pop-stack-cleanup.ll delete mode 100644 test/CodeGen/X86/pr21529.ll delete mode 100644 test/CodeGen/X86/pr23900.ll create mode 100644 test/CodeGen/X86/pr24139.ll create mode 100644 test/CodeGen/X86/pr24602.ll create mode 100644 test/CodeGen/X86/pr25828.ll create mode 100644 test/CodeGen/X86/prolog-push-seq.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower1.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower2.ll create mode 100644 test/CodeGen/X86/push-cfi-debug.ll create mode 100644 test/CodeGen/X86/push-cfi-obj.ll create mode 100644 test/CodeGen/X86/push-cfi.ll create mode 100644 test/CodeGen/X86/rem_crash.ll delete mode 100644 test/CodeGen/X86/remat-invalid-liveness.ll create mode 100644 test/CodeGen/X86/safestack.ll create mode 100644 test/CodeGen/X86/sar_fold.ll create mode 100644 test/CodeGen/X86/sar_fold64.ll create mode 100644 test/CodeGen/X86/scalar-fp-to-i64.ll 
create mode 100644 test/CodeGen/X86/scalar-int-to-fp.ll create mode 100644 test/CodeGen/X86/sdiv-pow2.ll create mode 100644 test/CodeGen/X86/seh-catchpad.ll create mode 100644 test/CodeGen/X86/seh-exception-code.ll delete mode 100644 test/CodeGen/X86/seh-filter.ll delete mode 100644 test/CodeGen/X86/seh-stack-realign-win32.ll create mode 100644 test/CodeGen/X86/shrink-wrap-chkstk.ll create mode 100644 test/CodeGen/X86/slow-unaligned-mem.ll create mode 100644 test/CodeGen/X86/soft-sitofp.ll create mode 100644 test/CodeGen/X86/sse-only.ll create mode 100644 test/CodeGen/X86/sse3-intrinsics-fast-isel.ll create mode 100644 test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll create mode 100644 test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll create mode 100644 test/CodeGen/X86/stack-folding-adx-x86_64.ll create mode 100644 test/CodeGen/X86/stackmap-frame-setup.ll create mode 100644 test/CodeGen/X86/switch-edge-weight.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsave.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsavec.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll create mode 100644 test/CodeGen/X86/system-intrinsics-64-xsaves.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsave.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsavec.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsaveopt.ll create mode 100644 test/CodeGen/X86/system-intrinsics-xsaves.ll create mode 100644 test/CodeGen/X86/tail-dup-catchret.ll create mode 100644 test/CodeGen/X86/tail-merge-wineh.ll create mode 100644 test/CodeGen/X86/tailcall-msvc-conventions.ll create mode 100644 test/CodeGen/X86/tailcall-readnone.ll create mode 100644 test/CodeGen/X86/tls-android-negative.ll create mode 100644 test/CodeGen/X86/tls-android.ll create mode 100644 test/CodeGen/X86/token_landingpad.ll create mode 100644 test/CodeGen/X86/trunc-store.ll create mode 100644 test/CodeGen/X86/vec_cmp_sint-128.ll create mode 100644 
test/CodeGen/X86/vec_cmp_uint-128.ll create mode 100644 test/CodeGen/X86/vec_minmax_sint.ll create mode 100644 test/CodeGen/X86/vec_minmax_uint.ll create mode 100644 test/CodeGen/X86/vec_uint_to_fp-fastmath.ll create mode 100644 test/CodeGen/X86/vector-lzcnt-512.ll create mode 100644 test/CodeGen/X86/vector-merge-store-fp-constants.ll create mode 100644 test/CodeGen/X86/vector-popcnt-512.ll create mode 100644 test/CodeGen/X86/vector-rotate-128.ll create mode 100644 test/CodeGen/X86/vector-rotate-256.ll create mode 100644 test/CodeGen/X86/vector-shift-ashr-512.ll create mode 100644 test/CodeGen/X86/vector-shift-lshr-512.ll create mode 100644 test/CodeGen/X86/vector-shift-shl-512.ll create mode 100644 test/CodeGen/X86/vector-shuffle-512-v32.ll create mode 100644 test/CodeGen/X86/vector-shuffle-v1.ll create mode 100644 test/CodeGen/X86/vector-tzcnt-512.ll create mode 100644 test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll create mode 100644 test/CodeGen/X86/vmovq.ll create mode 100644 test/CodeGen/X86/wide-integer-cmp.ll create mode 100644 test/CodeGen/X86/win-catchpad-csrs.ll create mode 100644 test/CodeGen/X86/win-catchpad-nested-cxx.ll create mode 100644 test/CodeGen/X86/win-catchpad-nested.ll create mode 100644 test/CodeGen/X86/win-catchpad-varargs.ll create mode 100644 test/CodeGen/X86/win-catchpad.ll create mode 100644 test/CodeGen/X86/win-cleanuppad.ll create mode 100644 test/CodeGen/X86/win-funclet-cfi.ll create mode 100644 test/CodeGen/X86/win-mixed-ehpersonality.ll create mode 100644 test/CodeGen/X86/win32-seh-catchpad-realign.ll create mode 100644 test/CodeGen/X86/win32-seh-catchpad.ll create mode 100644 test/CodeGen/X86/win32-seh-nested-finally.ll create mode 100644 test/CodeGen/X86/win32-spill-xmm.ll create mode 100644 test/CodeGen/X86/win64_sibcall.ll create mode 100644 test/CodeGen/X86/win_coreclr_chkstk.ll delete mode 100644 test/CodeGen/X86/win_eh_prepare.ll delete mode 100644 test/CodeGen/X86/win_ftol2.ll create mode 100644 
test/CodeGen/X86/wineh-coreclr.ll create mode 100644 test/CodeGen/X86/wineh-exceptionpointer.ll create mode 100644 test/CodeGen/X86/wineh-no-ehpads.ll create mode 100644 test/CodeGen/X86/x32-indirectbr.ll create mode 100644 test/CodeGen/X86/x32-landingpad.ll create mode 100644 test/CodeGen/X86/x32-va_start.ll create mode 100644 test/CodeGen/X86/x86-32-intrcc.ll create mode 100644 test/CodeGen/X86/x86-64-intrcc.ll create mode 100644 test/CodeGen/X86/x86-64-ms_abi-vararg.ll create mode 100644 test/CodeGen/X86/x86-sanitizer-shrink-wrapping.ll create mode 100644 test/CodeGen/X86/x86-shrink-wrap-unwind.ll create mode 100644 test/CodeGen/X86/x86-win64-shrink-wrapping.ll create mode 100644 test/CodeGen/X86/xop-pcmov.ll (limited to 'test/CodeGen') diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll index b075573cc6742..5eb455f3a22cd 100644 --- a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll +++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll @@ -3,7 +3,7 @@ ; Bug 20598 -define void @test() #0 { +define void @test() #0 !dbg !4 { entry: br label %for.body, !dbg !39 @@ -44,39 +44,39 @@ attributes #1 = { nounwind readnone } !llvm.module.flags = !{!36, !37} !llvm.ident = !{!38} -!0 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "test.c", directory: "") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 141, file: !1, scope: !1, type: !6, function: void ()* @test, 
variables: !12) +!4 = distinct !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 141, file: !1, scope: !1, type: !6, variables: !12) !6 = !DISubroutineType(types: !7) !7 = !{null, !8} !8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9) !9 = !DIDerivedType(tag: DW_TAG_typedef, line: 30, file: !1, baseType: !11) !11 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !12 = !{!13, !14, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35} -!13 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "", line: 140, arg: 1, scope: !4, file: !1, type: !8) -!14 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) +!13 = !DILocalVariable(name: "", line: 140, arg: 1, scope: !4, file: !1, type: !8) +!14 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) !15 = !DIDerivedType(tag: DW_TAG_typedef, line: 183, file: !1, baseType: !17) !17 = !DIBasicType(tag: DW_TAG_base_type, size: 64, align: 64, encoding: DW_ATE_signed) -!18 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!19 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!20 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!21 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!22 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!23 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!24 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15) -!25 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, 
file: !1, type: !15) -!26 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15) -!27 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15) -!28 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15) -!29 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!30 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!31 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!32 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!33 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15) -!34 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 145, scope: !4, file: !1, type: !8) -!35 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 146, scope: !4, file: !1, type: !11) +!18 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!19 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!20 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!21 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!22 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!23 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!24 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15) +!25 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!26 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!27 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!28 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15) +!29 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!30 = 
!DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!31 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!32 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!33 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15) +!34 = !DILocalVariable(name: "", line: 145, scope: !4, file: !1, type: !8) +!35 = !DILocalVariable(name: "", line: 146, scope: !4, file: !1, type: !11) !36 = !{i32 2, !"Dwarf Version", i32 4} !37 = !{i32 2, !"Debug Info Version", i32 3} !38 = !{!"clang version 3.6.0 "} diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll new file mode 100644 index 0000000000000..ca374eea28e72 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-addv.ll @@ -0,0 +1,98 @@ +; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s + +define i8 @add_B(<16 x i8>* %arr) { +; CHECK-LABEL: add_B +; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b + %bin.rdx = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> + %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0 + %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> + %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf + %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> + %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> + %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13 + %r = extractelement <16 x i8> %bin.rdx14, i32 0 + ret i8 %r +} + +define i16 @add_H(<8 x i16>* %arr) { +; CHECK-LABEL: add_H +; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + %bin.rdx = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> + %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> + %bin.rdx13 = add <8 x i16> %bin.rdx11, 
%rdx.shuf12 + %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> + %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13 + %r = extractelement <8 x i16> %bin.rdx14, i32 0 + ret i16 %r +} + +define i32 @add_S( <4 x i32>* %arr) { +; CHECK-LABEL: add_S +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + %bin.rdx = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> + %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> + %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12 + %r = extractelement <4 x i32> %bin.rdx13, i32 0 + ret i32 %r +} + +define i64 @add_D(<2 x i64>* %arr) { +; CHECK-LABEL: add_D +; CHECK-NOT: addv + %bin.rdx = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> + %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0 + %r = extractelement <2 x i64> %bin.rdx0, i32 0 + ret i64 %r +} + +define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) { +; CHECK-LABEL: oversized_ADDV_256 +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s +entry: + %0 = bitcast i8* %arg1 to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = bitcast i8* %arg2 to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i32> + %6 = sub nsw <8 x i32> %2, %5 + %7 = icmp slt <8 x i32> %6, zeroinitializer + %8 = sub nsw <8 x i32> zeroinitializer, %6 + %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 + %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %9, %rdx.shuf + %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1 + %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32> + %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3 + %10 = extractelement <8 
x i32> %bin.rdx4, i32 0 + ret i32 %10 +} + +define i32 @oversized_ADDV_512(<16 x i32>* %arr) { +; CHECK-LABEL: oversized_ADDV_512 +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + %bin.rdx = load <16 x i32>, <16 x i32>* %arr + + %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0 + + %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32> + %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf + + %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x i32> undef, <16 x i32> + %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12 + + %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> + %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13 + + %r = extractelement <16 x i32> %bin.rdx14, i32 0 + ret i32 %r +} diff --git a/test/CodeGen/AArch64/aarch64-deferred-spilling.ll b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll new file mode 100644 index 0000000000000..7accdced7d44d --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll @@ -0,0 +1,514 @@ +;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED +;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR + +; Check that we do not end up with useless spill code. +; +; Move to the basic block we are interested in. +; +; CHECK: // %if.then.120 +; +; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill +; Check that w21 wouldn't need to be spilled since it is never reused. +; REGULAR-NOT: {{[wx]}}21{{,?}} +; +; Check that w22 is used to carry a value through the call. 
+; DEFERRED-NOT: str {{[wx]}}22, +; DEFERRED: mov {{[wx]}}22, +; DEFERRED-NOT: str {{[wx]}}22, +; +; CHECK: bl fprintf +; +; DEFERRED-NOT: ldr {{[wx]}}22, +; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22 +; DEFERRED-NOT: ldr {{[wx]}}22, +; +; REGULAR-NOT: {{[wx]}}21{{,?}} +; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload +; +; End of the basic block we are interested in. +; CHECK: b +; CHECK: {{[^:]+}}: // %sw.bb.123 + +%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 } +%struct.__sbuf = type { i8*, i64 } +%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* } +%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* } + +@__sF = external global [0 x %struct.__sFILE], align 8 +@.str = private unnamed_addr constant [20 x i8] c"\0A [%d: stuff+mf \00", align 1 + +declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...) 
+ +declare void @bar(i32) + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) + +define i32 @foo(%struct.DState* %s) { +entry: + %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1 + %tmp = load i32, i32* %state, align 4 + %cmp = icmp eq i32 %tmp, 10 + %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40 + br i1 %cmp, label %if.end.thread, label %if.end + +if.end.thread: ; preds = %entry + %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 + %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 + %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 + %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 + %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 + %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 + %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 + %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 + %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 + %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 + %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 + %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 + %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 + %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 + %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 + %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 + %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 + %save_zj = 
getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58 + %tmp1 = bitcast i32* %save_i to i8* + call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false) + br label %sw.default + +if.end: ; preds = %entry + %.pre = load i32, i32* %save_i, align 4 + %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 + %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4 + %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 + %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4 + %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 + %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4 + %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 + %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4 + %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 + %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4 + %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 + %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4 + %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 + %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4 + %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 + %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4 + %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 + %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4 + %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 + %.pre415 = load 
i32, i32* %save_nblockMAX12.phi.trans.insert, align 4 + %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 + %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4 + %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 + %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4 + %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 + %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4 + %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 + %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4 + %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 + %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4 + %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 + %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4 + %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 + %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4 + %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58 + %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4 + switch i32 %tmp, label %sw.default [ + i32 13, label %sw.bb + i32 14, label %if.end.sw.bb.65_crit_edge + i32 25, label %if.end.sw.bb.123_crit_edge + ] + +if.end.sw.bb.123_crit_edge: ; preds = %if.end + %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 + br label %sw.bb.123 + +if.end.sw.bb.65_crit_edge: ; preds = %if.end + %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 + %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4 + br label %sw.bb.65 + +sw.bb: ; preds = 
%if.end + %sunkaddr = ptrtoint %struct.DState* %s to i64 + %sunkaddr485 = add i64 %sunkaddr, 8 + %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32* + store i32 13, i32* %sunkaddr486, align 4 + %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 + %tmp2 = load i32, i32* %bsLive, align 4 + %cmp28.400 = icmp sgt i32 %tmp2, 7 + br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph + +sw.bb.if.then.29_crit_edge: ; preds = %sw.bb + %sunkaddr487 = ptrtoint %struct.DState* %s to i64 + %sunkaddr488 = add i64 %sunkaddr487, 32 + %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32* + %.pre425 = load i32, i32* %sunkaddr489, align 4 + br label %if.then.29 + +if.end.33.lr.ph: ; preds = %sw.bb + %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream** + %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8 + %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1 + %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4 + %tmp4 = add i32 %.pre430, -1 + br label %if.end.33 + +if.then.29: ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge + %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ] + %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ] + %sub = add nsw i32 %.lcssa393, -8 + %shr = lshr i32 %tmp5, %sub + %and = and i32 %shr, 255 + %sunkaddr491 = ptrtoint %struct.DState* %s to i64 + %sunkaddr492 = add i64 %sunkaddr491, 36 + %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32* + store i32 %sub, i32* %sunkaddr493, align 4 + %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9 + store i32 %and, i32* %blockSize100k, align 4 + %and.off = add nsw i32 %and, -49 + %tmp6 = icmp ugt i32 %and.off, 8 + br i1 %tmp6, label %save_state_and_return, label %if.end.62 + +if.end.33: ; preds = %while.body.backedge, %if.end.33.lr.ph + %lsr.iv482 = phi i32 [ 
%tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ] + %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ] + %cmp35 = icmp eq i32 %lsr.iv482, -1 + br i1 %cmp35, label %save_state_and_return, label %if.end.37 + +if.end.37: ; preds = %if.end.33 + %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8** + %sunkaddr494 = ptrtoint %struct.DState* %s to i64 + %sunkaddr495 = add i64 %sunkaddr494, 32 + %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32* + %tmp9 = load i32, i32* %sunkaddr496, align 4 + %shl = shl i32 %tmp9, 8 + %tmp10 = load i8*, i8** %tmp8, align 8 + %tmp11 = load i8, i8* %tmp10, align 1 + %conv = zext i8 %tmp11 to i32 + %or = or i32 %conv, %shl + store i32 %or, i32* %sunkaddr496, align 4 + %add = add nsw i32 %tmp7, 8 + %sunkaddr497 = ptrtoint %struct.DState* %s to i64 + %sunkaddr498 = add i64 %sunkaddr497, 36 + %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32* + store i32 %add, i32* %sunkaddr499, align 4 + %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1 + store i8* %incdec.ptr, i8** %tmp8, align 8 + %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64 + %sunkaddr501 = add i64 %sunkaddr500, 8 + %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32* + store i32 %lsr.iv482, i32* %sunkaddr502, align 4 + %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64 + %sunkaddr504 = add i64 %sunkaddr503, 12 + %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32* + %tmp12 = load i32, i32* %sunkaddr505, align 4 + %inc = add i32 %tmp12, 1 + store i32 %inc, i32* %sunkaddr505, align 4 + %cmp49 = icmp eq i32 %inc, 0 + br i1 %cmp49, label %if.then.51, label %while.body.backedge + +if.then.51: ; preds = %if.end.37 + %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64 + %sunkaddr507 = add i64 %sunkaddr506, 16 + %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32* + %tmp13 = load i32, i32* %sunkaddr508, align 4 + %inc53 = add i32 %tmp13, 1 + store i32 %inc53, i32* %sunkaddr508, align 4 + br label %while.body.backedge + 
+while.body.backedge: ; preds = %if.then.51, %if.end.37 + %lsr.iv.next483 = add i32 %lsr.iv482, -1 + %cmp28 = icmp sgt i32 %add, 7 + br i1 %cmp28, label %if.then.29, label %if.end.33 + +if.end.62: ; preds = %if.then.29 + %sub64 = add nsw i32 %and, -48 + %sunkaddr509 = ptrtoint %struct.DState* %s to i64 + %sunkaddr510 = add i64 %sunkaddr509, 40 + %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32* + store i32 %sub64, i32* %sunkaddr511, align 4 + br label %sw.bb.65 + +sw.bb.65: ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge + %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ] + %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ] + %sunkaddr512 = ptrtoint %struct.DState* %s to i64 + %sunkaddr513 = add i64 %sunkaddr512, 8 + %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32* + store i32 14, i32* %sunkaddr514, align 4 + %cmp70.397 = icmp sgt i32 %tmp14, 7 + br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph + +if.end.82.lr.ph: ; preds = %sw.bb.65 + %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream** + %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8 + %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1 + %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4 + %tmp16 = add i32 %.pre431, -1 + br label %if.end.82 + +if.then.72: ; preds = %while.body.68.backedge, %sw.bb.65 + %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ] + %sub76 = add nsw i32 %.lcssa390, -8 + %sunkaddr516 = ptrtoint %struct.DState* %s to i64 + %sunkaddr517 = add i64 %sunkaddr516, 36 + %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32* + store i32 %sub76, i32* %sunkaddr518, align 4 + %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11 + %tmp17 = load i32, i32* %currBlockNo, align 4 + %inc117 = add nsw i32 %tmp17, 1 + store i32 %inc117, i32* 
%currBlockNo, align 4 + %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12 + %tmp18 = load i32, i32* %verbosity, align 4 + %cmp118 = icmp sgt i32 %tmp18, 1 + br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0 + +if.end.82: ; preds = %while.body.68.backedge, %if.end.82.lr.ph + %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ] + %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ] + %cmp85 = icmp eq i32 %lsr.iv480, -1 + br i1 %cmp85, label %save_state_and_return, label %if.end.88 + +if.end.88: ; preds = %if.end.82 + %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8** + %sunkaddr519 = ptrtoint %struct.DState* %s to i64 + %sunkaddr520 = add i64 %sunkaddr519, 32 + %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32* + %tmp21 = load i32, i32* %sunkaddr521, align 4 + %shl90 = shl i32 %tmp21, 8 + %tmp22 = load i8*, i8** %tmp20, align 8 + %tmp23 = load i8, i8* %tmp22, align 1 + %conv93 = zext i8 %tmp23 to i32 + %or94 = or i32 %conv93, %shl90 + store i32 %or94, i32* %sunkaddr521, align 4 + %add97 = add nsw i32 %tmp19, 8 + %sunkaddr522 = ptrtoint %struct.DState* %s to i64 + %sunkaddr523 = add i64 %sunkaddr522, 36 + %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32* + store i32 %add97, i32* %sunkaddr524, align 4 + %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1 + store i8* %incdec.ptr100, i8** %tmp20, align 8 + %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64 + %sunkaddr526 = add i64 %sunkaddr525, 8 + %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32* + store i32 %lsr.iv480, i32* %sunkaddr527, align 4 + %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64 + %sunkaddr529 = add i64 %sunkaddr528, 12 + %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32* + %tmp24 = load i32, i32* %sunkaddr530, align 4 + %inc106 = add i32 %tmp24, 1 + store i32 %inc106, i32* %sunkaddr530, align 4 + %cmp109 = icmp eq i32 %inc106, 0 + br i1 %cmp109, label 
%if.then.111, label %while.body.68.backedge + +if.then.111: ; preds = %if.end.88 + %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64 + %sunkaddr532 = add i64 %sunkaddr531, 16 + %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32* + %tmp25 = load i32, i32* %sunkaddr533, align 4 + %inc114 = add i32 %tmp25, 1 + store i32 %inc114, i32* %sunkaddr533, align 4 + br label %while.body.68.backedge + +while.body.68.backedge: ; preds = %if.then.111, %if.end.88 + %lsr.iv.next481 = add i32 %lsr.iv480, -1 + %cmp70 = icmp sgt i32 %add97, 7 + br i1 %cmp70, label %if.then.72, label %if.end.82 + +if.then.120: ; preds = %if.then.72 + %call = tail call i32 (%struct.__sFILE*, i8*, ...) @fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117) + br label %sw.bb.123 + +sw.bb.123: ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge + %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ] + %sunkaddr534 = ptrtoint %struct.DState* %s to i64 + %sunkaddr535 = add i64 %sunkaddr534, 8 + %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32* + store i32 25, i32* %sunkaddr536, align 4 + %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4 + %cmp128.395 = icmp sgt i32 %tmp26, 7 + br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph + +sw.bb.123.if.then.130_crit_edge: ; preds = %sw.bb.123 + %sunkaddr537 = ptrtoint %struct.DState* %s to i64 + %sunkaddr538 = add i64 %sunkaddr537, 32 + %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32* + %.pre429 = load i32, i32* %sunkaddr539, align 4 + br label %if.then.130 + +if.end.140.lr.ph: ; preds = %sw.bb.123 + %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream** + %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8 + %avail_in142.phi.trans.insert = getelementptr 
inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1 + %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4 + %tmp28 = add i32 %.pre432, -1 + br label %if.end.140 + +if.then.130: ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge + %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ] + %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ] + %sub134 = add nsw i32 %.lcssa, -8 + %shr135 = lshr i32 %tmp29, %sub134 + store i32 %sub134, i32* %bsLive127.pre-phi, align 4 + %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13 + %tmp30 = load i32, i32* %origPtr, align 4 + %shl175 = shl i32 %tmp30, 8 + %conv176 = and i32 %shr135, 255 + %or177 = or i32 %shl175, %conv176 + store i32 %or177, i32* %origPtr, align 4 + %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27 + %tmp31 = load i32, i32* %nInUse, align 4 + %add179 = add nsw i32 %tmp31, 2 + br label %save_state_and_return + +if.end.140: ; preds = %while.body.126.backedge, %if.end.140.lr.ph + %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ] + %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ] + %cmp143 = icmp eq i32 %lsr.iv, -1 + br i1 %cmp143, label %save_state_and_return, label %if.end.146 + +if.end.146: ; preds = %if.end.140 + %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8** + %sunkaddr541 = ptrtoint %struct.DState* %s to i64 + %sunkaddr542 = add i64 %sunkaddr541, 32 + %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32* + %tmp34 = load i32, i32* %sunkaddr543, align 4 + %shl148 = shl i32 %tmp34, 8 + %tmp35 = load i8*, i8** %tmp33, align 8 + %tmp36 = load i8, i8* %tmp35, align 1 + %conv151 = zext i8 %tmp36 to i32 + %or152 = or i32 %conv151, %shl148 + store i32 %or152, i32* %sunkaddr543, align 4 + %add155 = add nsw i32 %tmp32, 8 + store i32 
%add155, i32* %bsLive127.pre-phi, align 4 + %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1 + store i8* %incdec.ptr158, i8** %tmp33, align 8 + %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64 + %sunkaddr545 = add i64 %sunkaddr544, 8 + %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32* + store i32 %lsr.iv, i32* %sunkaddr546, align 4 + %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64 + %sunkaddr548 = add i64 %sunkaddr547, 12 + %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32* + %tmp37 = load i32, i32* %sunkaddr549, align 4 + %inc164 = add i32 %tmp37, 1 + store i32 %inc164, i32* %sunkaddr549, align 4 + %cmp167 = icmp eq i32 %inc164, 0 + br i1 %cmp167, label %if.then.169, label %while.body.126.backedge + +if.then.169: ; preds = %if.end.146 + %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64 + %sunkaddr551 = add i64 %sunkaddr550, 16 + %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32* + %tmp38 = load i32, i32* %sunkaddr552, align 4 + %inc172 = add i32 %tmp38, 1 + store i32 %inc172, i32* %sunkaddr552, align 4 + br label %while.body.126.backedge + +while.body.126.backedge: ; preds = %if.then.169, %if.end.146 + %lsr.iv.next = add i32 %lsr.iv, -1 + %cmp128 = icmp sgt i32 %add155, 7 + br i1 %cmp128, label %if.then.130, label %if.end.140 + +sw.default: ; preds = %if.end, %if.end.thread + %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ] + %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ] + %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ] + %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ] + %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ] + %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ] + %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ] + %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ] + %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ] + %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ] + %tmp49 = phi i32 [ 0, 
%if.end.thread ], [ %.pre415, %if.end ] + %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ] + %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ] + %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ] + %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ] + %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ] + %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ] + %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ] + %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ] + %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ] + %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ] + %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ] + %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ] + %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ] + %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ] + %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ] + %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ] + %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ] + %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ] + %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ %save_nblock13.phi.trans.insert, %if.end ] + %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ] + %save_N15.pre-phi445 = phi i32* [ %save_N, 
%if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ] + %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ] + %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ] + %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ] + %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ] + %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ] + tail call void @bar(i32 4001) + br label %save_state_and_return + +save_state_and_return: ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29 + %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ] + %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ] + %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ] + %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ] + %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ] + %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ] + %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 
] + %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ] + %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ] + %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ] + %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ] + %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ] + %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ] + %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ] + %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, %if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ] + %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ] + %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ] + %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ 
%save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ] + %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ] + %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ] + %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ] + %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ] + %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ] + %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ 
%save_groupNo9.phi.trans.insert, %if.end.33 ] + %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ] + %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ] + %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ] + %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ] + %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ] + %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, 
%if.end.33 ] + %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ] + %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ] + %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ] + %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ] + %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ] + %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ] + %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ] + %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 
0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ] + store i32 %tmp58, i32* %save_i, align 4 + store i32 %tmp59, i32* %save_j3.pre-phi468, align 4 + store i32 %tmp60, i32* %save_t4.pre-phi466, align 4 + store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4 + store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4 + store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4 + store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4 + store i32 %tmp64, i32* %save_groupNo9.pre-phi456, align 4 + store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4 + store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4 + store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4 + store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4 + store i32 %tmp68, i32* %save_es14.pre-phi446, align 4 + store i32 %tmp69, i32* %save_N15.pre-phi444, align 4 + store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4 + store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4 + store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4 + store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4 + store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4 + ret i32 %retVal.0 +} + +!0 = !{!"branch_weights", i32 10, i32 1} diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll index 739570236da92..1820b8163a905 100644 --- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-post-ra < %s | FileCheck %s ; This test aims to check basic correctness of frame layout & ; frame access code. 
There are 8 functions in this test file, @@ -252,11 +252,11 @@ entry: ; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24] ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through frame pointer @@ -299,11 +299,11 @@ entry: ; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24] ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through frame pointer @@ -361,11 +361,11 @@ entry: ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack ; and set-up of base pointer (x19). -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through base pointer @@ -414,11 +414,11 @@ entry: ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack ; and set-up of base pointer (x19). 
-; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through base pointer @@ -465,11 +465,11 @@ entry: ; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] ; Check correct reservation of 16-byte aligned VLA (size in w0) on stack ; and set-up of base pointer (x19). -; CHECK: ubfx x9, x0, #0, #32 +; CHECK: mov w9, w0 +; CHECK: mov x10, sp ; CHECK: lsl x9, x9, #2 ; CHECK: add x9, x9, #15 ; CHECK: and x9, x9, #0x7fffffff0 -; CHECK: mov x10, sp ; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9 ; CHECK: mov sp, x[[VLASPTMP]] ; Check correct access to local variable, through base pointer @@ -522,10 +522,10 @@ bb1: ; CHECK-LABEL: realign_conditional2 ; Extra realignment in the prologue (performance issue). +; CHECK: tbz {{.*}} .[[LABEL:.*]] ; CHECK: sub x9, sp, #32 // =32 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp -; CHECK: tbz {{.*}} .[[LABEL:.*]] ; Stack is realigned in a non-entry BB. 
; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll index ea3b8fa557328..1bc2a3ccb1ca0 100644 --- a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll +++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll @@ -1,7 +1,10 @@ -; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -lower-interleaved-accesses=true < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON +; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true -mattr=-neon < %s | FileCheck %s -check-prefix=NONEON -; CHECK-LABEL: load_factor2: -; CHECK: ld2 { v0.8b, v1.8b }, [x0] +; NEON-LABEL: load_factor2: +; NEON: ld2 { v0.8b, v1.8b }, [x0] +; NONEON-LABEL: load_factor2: +; NONEON-NOT: ld2 define <8 x i8> @load_factor2(<16 x i8>* %ptr) { %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4 %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> @@ -10,8 +13,10 @@ define <8 x i8> @load_factor2(<16 x i8>* %ptr) { ret <8 x i8> %add } -; CHECK-LABEL: load_factor3: -; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: load_factor3: +; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: load_factor3: +; NONEON-NOT: ld3 define <4 x i32> @load_factor3(i32* %ptr) { %base = bitcast i32* %ptr to <12 x i32>* %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 @@ -21,8 +26,10 @@ define <4 x i32> @load_factor3(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: load_factor4: -; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: load_factor4: +; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: load_factor4: +; NONEON-NOT: ld4 define <4 x i32> @load_factor4(i32* %ptr) { %base = bitcast i32* %ptr to <16 x i32>* %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 @@ -32,16 +39,20 @@ define <4 x i32> @load_factor4(i32* %ptr) { ret <4 
x i32> %add } -; CHECK-LABEL: store_factor2: -; CHECK: st2 { v0.8b, v1.8b }, [x0] +; NEON-LABEL: store_factor2: +; NEON: st2 { v0.8b, v1.8b }, [x0] +; NONEON-LABEL: store_factor2: +; NONEON-NOT: st2 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) { %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4 ret void } -; CHECK-LABEL: store_factor3: -; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: store_factor3: +; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: store_factor3: +; NONEON-NOT: st3 define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { %base = bitcast i32* %ptr to <12 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -51,8 +62,10 @@ define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v ret void } -; CHECK-LABEL: store_factor4: -; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: store_factor4: +; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: store_factor4: +; NONEON-NOT: st4 define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { %base = bitcast i32* %ptr to <16 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -65,8 +78,10 @@ define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v ; The following cases test that interleaved access of pointer vectors can be ; matched to ldN/stN instruction. 
-; CHECK-LABEL: load_ptrvec_factor2: -; CHECK: ld2 { v0.2d, v1.2d }, [x0] +; NEON-LABEL: load_ptrvec_factor2: +; NEON: ld2 { v0.2d, v1.2d }, [x0] +; NONEON-LABEL: load_ptrvec_factor2: +; NONEON-NOT: ld2 define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) { %base = bitcast i32** %ptr to <4 x i32*>* %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4 @@ -74,8 +89,10 @@ define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) { ret <2 x i32*> %strided.v0 } -; CHECK-LABEL: load_ptrvec_factor3: -; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0] +; NEON-LABEL: load_ptrvec_factor3: +; NEON: ld3 { v0.2d, v1.2d, v2.2d }, [x0] +; NONEON-LABEL: load_ptrvec_factor3: +; NONEON-NOT: ld3 define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { %base = bitcast i32** %ptr to <6 x i32*>* %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4 @@ -86,8 +103,10 @@ define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr ret void } -; CHECK-LABEL: load_ptrvec_factor4: -; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NEON-LABEL: load_ptrvec_factor4: +; NEON: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NONEON-LABEL: load_ptrvec_factor4: +; NONEON-NOT: ld4 define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { %base = bitcast i32** %ptr to <8 x i32*>* %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4 @@ -98,8 +117,10 @@ define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr ret void } -; CHECK-LABEL: store_ptrvec_factor2: -; CHECK: st2 { v0.2d, v1.2d }, [x0] +; NEON-LABEL: store_ptrvec_factor2: +; NEON: st2 { v0.2d, v1.2d }, [x0] +; NONEON-LABEL: store_ptrvec_factor2: +; NONEON-NOT: st2 define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) { %base = bitcast i32** %ptr to <4 x i32*>* %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> @@ -107,8 +128,10 @@ define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) { 
ret void } -; CHECK-LABEL: store_ptrvec_factor3: -; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0] +; NEON-LABEL: store_ptrvec_factor3: +; NEON: st3 { v0.2d, v1.2d, v2.2d }, [x0] +; NONEON-LABEL: store_ptrvec_factor3: +; NONEON-NOT: st3 define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) { %base = bitcast i32** %ptr to <6 x i32*>* %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> @@ -118,8 +141,10 @@ define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 ret void } -; CHECK-LABEL: store_ptrvec_factor4: -; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NEON-LABEL: store_ptrvec_factor4: +; NEON: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +; NONEON-LABEL: store_ptrvec_factor4: +; NONEON-NOT: st4 define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) { %base = bitcast i32* %ptr to <8 x i32*>* %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> @@ -132,8 +157,10 @@ define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 ; Following cases check that shuffle maskes with undef indices can be matched ; into ldN/stN instruction. 
-; CHECK-LABEL: load_undef_mask_factor2: -; CHECK: ld2 { v0.4s, v1.4s }, [x0] +; NEON-LABEL: load_undef_mask_factor2: +; NEON: ld2 { v0.4s, v1.4s }, [x0] +; NONEON-LABEL: load_undef_mask_factor2: +; NONEON-NOT: ld2 define <4 x i32> @load_undef_mask_factor2(i32* %ptr) { %base = bitcast i32* %ptr to <8 x i32>* %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4 @@ -143,8 +170,10 @@ define <4 x i32> @load_undef_mask_factor2(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: load_undef_mask_factor3: -; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: load_undef_mask_factor3: +; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: load_undef_mask_factor3: +; NONEON-NOT: ld3 define <4 x i32> @load_undef_mask_factor3(i32* %ptr) { %base = bitcast i32* %ptr to <12 x i32>* %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 @@ -154,8 +183,10 @@ define <4 x i32> @load_undef_mask_factor3(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: load_undef_mask_factor4: -; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: load_undef_mask_factor4: +; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: load_undef_mask_factor4: +; NONEON-NOT: ld4 define <4 x i32> @load_undef_mask_factor4(i32* %ptr) { %base = bitcast i32* %ptr to <16 x i32>* %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 @@ -165,8 +196,10 @@ define <4 x i32> @load_undef_mask_factor4(i32* %ptr) { ret <4 x i32> %add } -; CHECK-LABEL: store_undef_mask_factor2: -; CHECK: st2 { v0.4s, v1.4s }, [x0] +; NEON-LABEL: store_undef_mask_factor2: +; NEON: st2 { v0.4s, v1.4s }, [x0] +; NONEON-LABEL: store_undef_mask_factor2: +; NONEON-NOT: st2 define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) { %base = bitcast i32* %ptr to <8 x i32>* %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -174,8 +207,10 @@ define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) { ret void } -; CHECK-LABEL: store_undef_mask_factor3: 
-; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NEON-LABEL: store_undef_mask_factor3: +; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0] +; NONEON-LABEL: store_undef_mask_factor3: +; NONEON-NOT: st3 define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { %base = bitcast i32* %ptr to <12 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -185,8 +220,10 @@ define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, < ret void } -; CHECK-LABEL: store_undef_mask_factor4: -; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NEON-LABEL: store_undef_mask_factor4: +; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: store_undef_mask_factor4: +; NONEON-NOT: st4 define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { %base = bitcast i32* %ptr to <16 x i32>* %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> @@ -195,3 +232,39 @@ define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, < store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 ret void } + +; Check that we do something sane with illegal types. 
+ +; NEON-LABEL: load_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: ldr q[[V:[0-9]+]], [x0] +; NEON-NEXT: uzp1 v0.4s, v[[V]].4s, v{{.*}}.4s +; NEON-NEXT: ret +; NONEON-LABEL: load_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: ldr s0, [x0] +; NONEON-NEXT: ldr s1, [x0, #8] +; NONEON-NEXT: ret +define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind { + %tmp1 = load <3 x float>, <3 x float>* %p, align 16 + %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> + ret <3 x float> %tmp2 +} + +; NEON-LABEL: store_illegal_factor2: +; NEON: BB#0: +; NEON-NEXT: uzp1 v0.4s, v0.4s, v{{.*}}.4s +; NEON-NEXT: st1 { v0.d }[0], [x0] +; NEON-NEXT: ret +; NONEON-LABEL: store_illegal_factor2: +; NONEON: BB#0: +; NONEON-NEXT: fmov w[[ELT2:[0-9]+]], s2 +; NONEON-NEXT: fmov w[[RES:[0-9]+]], s0 +; NONEON-NEXT: bfi x[[RES]], x[[ELT2]], #32, #32 +; NONEON-NEXT: str x[[RES]], [x0] +; NONEON-NEXT: ret +define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind { + %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> + store <3 x float> %tmp1, <3 x float>* %p, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll new file mode 100644 index 0000000000000..84277995ce5b5 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll @@ -0,0 +1,50 @@ +; RUN: llc -O3 -aarch64-gep-opt=true -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s +; REQUIRES: asserts +target triple = "aarch64--linux-android" + +%typeD = type { i32, i32, [256 x i32], [257 x i32] } + +; Function Attrs: noreturn nounwind uwtable +define i32 @test1(%typeD* nocapture %s) { +entry: +; CHECK-LABEL: entry: +; CHECK: %uglygep = getelementptr i8, i8* %0, i64 1032 +; CHECK: br label %do.body.i + + + %tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0 + %k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1 + %.pre = load i32, i32* %tPos, align 4 + br 
label %do.body.i + +do.body.i: +; CHECK-LABEL: do.body.i: +; CHECK: %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3 +; CHECK-NEXT: %4 = bitcast i8* %uglygep2 to i32* +; CHECK-NOT: %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032 + + + %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ] + %1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ] + %add.i = add nsw i32 %1, %0 + %shr.i = ashr i32 %add.i, 1 + %idxprom.i = sext i32 %shr.i to i64 + %arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i + %2 = load i32, i32* %arrayidx.i, align 4 + %cmp.i = icmp sle i32 %2, %.pre + %na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i + %nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1 + %sub.i = sub nsw i32 %na.1.i, %nb.1.i + %cmp1.i = icmp eq i32 %sub.i, 1 + br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge + +do.body.i.backedge: + %.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ] + %.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ] + br label %do.body.i + +fooo.exit: ; preds = %do.body.i + store i32 %nb.1.i, i32* %k0, align 4 + br label %do.body.i.backedge +} + diff --git a/test/CodeGen/AArch64/aarch64-minmaxv.ll b/test/CodeGen/AArch64/aarch64-minmaxv.ll new file mode 100644 index 0000000000000..fb13b706cfafb --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -0,0 +1,511 @@ +; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linu--gnu" + +; CHECK-LABEL: smax_B +; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { + %arr.load = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> 
%rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: smax_H +; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + 
%rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: smax_S +; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: smax_D +; CHECK-NOT: smaxv +define i64 @smax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: umax_B +; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = 
load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: umax_H +; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = 
select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: umax_S +; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: umax_D +; CHECK-NOT: umaxv +define i64 @umax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, 
i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: smin_B +; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: smin_H +; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp slt <8 x i16> 
%rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: smin_S +; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: smin_D +; CHECK-NOT: sminv +define i64 @smin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 
x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: umin_B +; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt 
+ ret i8 %r +} + +; CHECK-LABEL: umin_H +; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: umin_S +; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> 
%rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: umin_D +; CHECK-NOT: uminv +define i64 @umin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + +; CHECK-LABEL: fmaxnm_S +; CHECK: fmaxnmv +define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x float>, <4 x float>* %arr + %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1 + %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 + %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 + %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 + %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt + ret float %r +} + +; CHECK-LABEL: fminnm_S +; CHECK: fminnmv +define float @fminnm_S(<4 x float>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x float>, <4 x float>* %arr + %rdx.shuf = shufflevector <4 x 
float> %rdx.minmax.select, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1 + %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 + %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 + %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 + %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt + ret float %r +} + +define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umax_256 +; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: umaxv {{h[0-9]+}}, [[V0]] + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i16> 
%rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umax_512 +; CHECK: umax v +; CHECK-NEXT: umax v +; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r 
+} + +define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umin_256 +; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: uminv {{h[0-9]+}}, [[V0]] + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_umin_512 +; CHECK: umin v +; CHECK-NEXT: umin v +; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x 
i32> + %rdx.minmax.cmp22 = icmp ult <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r +} + +define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smax_256 +; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: smaxv {{h[0-9]+}}, [[V0]] + %arr.load = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> 
%rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smax_512 +; CHECK: smax v +; CHECK-NEXT: smax v +; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> 
%rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r +} + +define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smin_256 +; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: sminv {{h[0-9]+}}, [[V0]] + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 
%rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + ret i16 %r +} + +define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) { +; CHECK-LABEL: oversized_smin_512 +; CHECK: smin v +; CHECK-NEXT: smin v +; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]] + %arr.load = load <16 x i32>, <16 x i32>* %arr + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + ret i32 %r +} diff --git a/test/CodeGen/AArch64/aarch64-smax-constantfold.ll b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll new file mode 100644 index 0000000000000..0e5b59f95126d --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll @@ -0,0 +1,12 @@ +; RUN: llc 
-mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s + +; Function Attrs: nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) + +; CHECK-LABEL: test +define <4 x i16> @test() { +entry: +; CHECK: movi d{{[0-9]+}}, #0000000000000000 + %0 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> , <4 x i16> zeroinitializer) + ret <4 x i16> %0 +} diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll index f0c7572ebf136..f30ab89f238bf 100644 --- a/test/CodeGen/AArch64/addsub_ext.ll +++ b/test/CodeGen/AArch64/addsub_ext.ll @@ -80,6 +80,64 @@ end: ret void } +define void @sub_i8rhs() minsize { +; CHECK-LABEL: sub_i8rhs: + %val8_tmp = load i8, i8* @var8 + %lhs32 = load i32, i32* @var32 + %lhs64 = load i64, i64* @var64 + + ; Need this to prevent extension upon load and give a vanilla i8 operand. + %val8 = add i8 %val8_tmp, 123 + + +; Zero-extending to 32-bits + %rhs32_zext = zext i8 %val8 to i32 + %res32_zext = sub i32 %lhs32, %rhs32_zext + store volatile i32 %res32_zext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb + + %rhs32_zext_shift = shl i32 %rhs32_zext, 3 + %res32_zext_shift = sub i32 %lhs32, %rhs32_zext_shift + store volatile i32 %res32_zext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb #3 + + +; Zero-extending to 64-bits + %rhs64_zext = zext i8 %val8 to i64 + %res64_zext = sub i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb + + %rhs64_zext_shift = shl i64 %rhs64_zext, 1 + %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb #1 + +; Sign-extending to 32-bits + %rhs32_sext = sext i8 %val8 to i32 + %res32_sext = sub i32 %lhs32, %rhs32_sext + store volatile i32 %res32_sext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb + + 
%rhs32_sext_shift = shl i32 %rhs32_sext, 1 + %res32_sext_shift = sub i32 %lhs32, %rhs32_sext_shift + store volatile i32 %res32_sext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb #1 + +; Sign-extending to 64-bits + %rhs64_sext = sext i8 %val8 to i64 + %res64_sext = sub i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb + + %rhs64_sext_shift = shl i64 %rhs64_sext, 4 + %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb #4 + + ret void +} + define void @addsub_i16rhs() minsize { ; CHECK-LABEL: addsub_i16rhs: %val16_tmp = load i16, i16* @var16 @@ -155,6 +213,64 @@ end: ret void } +define void @sub_i16rhs() minsize { +; CHECK-LABEL: sub_i16rhs: + %val16_tmp = load i16, i16* @var16 + %lhs32 = load i32, i32* @var32 + %lhs64 = load i64, i64* @var64 + + ; Need this to prevent extension upon load and give a vanilla i16 operand. 
+ %val16 = add i16 %val16_tmp, 123 + + +; Zero-extending to 32-bits + %rhs32_zext = zext i16 %val16 to i32 + %res32_zext = sub i32 %lhs32, %rhs32_zext + store volatile i32 %res32_zext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth + + %rhs32_zext_shift = shl i32 %rhs32_zext, 3 + %res32_zext_shift = sub i32 %lhs32, %rhs32_zext_shift + store volatile i32 %res32_zext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth #3 + + +; Zero-extending to 64-bits + %rhs64_zext = zext i16 %val16 to i64 + %res64_zext = sub i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth + + %rhs64_zext_shift = shl i64 %rhs64_zext, 1 + %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth #1 + +; Sign-extending to 32-bits + %rhs32_sext = sext i16 %val16 to i32 + %res32_sext = sub i32 %lhs32, %rhs32_sext + store volatile i32 %res32_sext, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth + + %rhs32_sext_shift = shl i32 %rhs32_sext, 1 + %res32_sext_shift = sub i32 %lhs32, %rhs32_sext_shift + store volatile i32 %res32_sext_shift, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth #1 + +; Sign-extending to 64-bits + %rhs64_sext = sext i16 %val16 to i64 + %res64_sext = sub i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth + + %rhs64_sext_shift = shl i64 %rhs64_sext, 4 + %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth #4 + + ret void +} + ; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for ; example), but the remaining instructions are probably not idiomatic ; in the face of "add/sub (shifted register)" so I don't intend to. 
@@ -187,3 +303,33 @@ define void @addsub_i32rhs() minsize { ret void } + +define void @sub_i32rhs() minsize { +; CHECK-LABEL: sub_i32rhs: + %val32_tmp = load i32, i32* @var32 + %lhs64 = load i64, i64* @var64 + + %val32 = add i32 %val32_tmp, 123 + + %rhs64_zext = zext i32 %val32 to i64 + %res64_zext = sub i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw + + %rhs64_zext_shift = shl i64 %rhs64_zext, 2 + %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2 + + %rhs64_sext = sext i32 %val32 to i64 + %res64_sext = sub i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw + + %rhs64_sext_shift = shl i64 %rhs64_sext, 2 + %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw #2 + + ret void +} diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index 5b2278ce8a351..45754377b2d91 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s +; RUN: llc -mtriple=aarch64-linux-gnu -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s declare void @use_addr(i8*) diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll index 173a440326ac6..a66ea0df2e987 100644 --- 
a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll +++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll @@ -22,22 +22,22 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone !llvm.dbg.sp = !{!1, !7, !10, !11, !12} !0 = !DIGlobalVariable(name: "vsplive", line: 617, isLocal: true, isDefinition: true, scope: !1, file: !2, type: !6) -!1 = !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) +!1 = distinct !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) !2 = !DIFile(filename: "print.i", directory: "/Volumes/Ebi/echeng/radars/r9146594") -!3 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: 0, file: !20, enums: !21, retainedTypes: !21) +!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: 0, file: !20, enums: !21, retainedTypes: !21) !4 = !DISubroutineType(types: !5) !5 = !{!6} !6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) -!7 = !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) +!7 = distinct !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) !8 = !DISubroutineType(types: !9) !9 = !{null} -!10 = !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: 
true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) -!11 = !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) -!12 = !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) +!10 = distinct !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) +!11 = distinct !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4) +!12 = distinct !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8) !13 = !DILocation(line: 653, column: 5, scope: !14) !14 = distinct !DILexicalBlock(line: 652, column: 35, file: !20, scope: !15) !15 = distinct !DILexicalBlock(line: 616, column: 1, file: !20, scope: !1) -!16 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "do_tab_convert", line: 853, scope: !17, file: !2, type: !6) +!16 = !DILocalVariable(name: "do_tab_convert", line: 853, scope: !17, file: !2, type: !6) !17 = distinct !DILexicalBlock(line: 850, column: 12, file: !20, scope: !14) !18 = !DILocation(line: 853, column: 11, scope: !17) !19 = !DILocation(line: 853, column: 29, scope: !17) diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll index f27570acc820e..e77952e4b8a1c 100644 --- a/test/CodeGen/AArch64/arm64-aapcs-be.ll +++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll @@ -32,7 +32,7 @@ define float @test_block_addr([8 x float], [1 x float] %in) { define void @test_block_addr_callee() { ; CHECK-LABEL: 
test_block_addr_callee: -; CHECK: str {{[a-z0-9]+}}, [sp] +; CHECK: str {{[a-z0-9]+}}, [sp, #-16]! ; CHECK: bl test_block_addr %val = insertvalue [1 x float] undef, float 0.0, 0 call float @test_block_addr([8 x float] undef, [1 x float] %val) diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll index d0880cd4f3eb5..441f45bf90b34 100644 --- a/test/CodeGen/AArch64/arm64-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false -disable-post-ra < %s | FileCheck %s @var = global i32 0, align 4 @@ -27,12 +27,13 @@ define [2 x i64] @test_i64x2_align(i32, [2 x i64] %arg, i32 %after) { ; Check stack slots are 64-bit at all times. define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, i32 %int, i64 %long) { - ; Part of last store. Blasted scheduler. -; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] - %ext_bool = zext i1 %bool to i64 store volatile i64 %ext_bool, i64* @var64, align 8 ; CHECK: ldrb w[[EXT:[0-9]+]], [sp] + + ; Part of last store. Blasted scheduler. 
+; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] + ; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1 ; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64] @@ -63,8 +64,8 @@ define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { %ext_bool = zext i1 %bool to i64 store volatile i64 %ext_bool, i64* @var64 -; CHECK: and [[EXT:x[0-9]+]], x0, #0x1 -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: and w[[EXT:[0-9]+]], w0, #0x1 +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] %ext_char = sext i8 %char to i64 store volatile i64 %ext_char, i64* @var64 @@ -73,13 +74,13 @@ define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { %ext_short = zext i16 %short to i64 store volatile i64 %ext_short, i64* @var64 -; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: and w[[EXT:[0-9]+]], w2, #0xffff +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] %ext_int = zext i32 %int to i64 store volatile i64 %ext_int, i64* @var64 -; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32 -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: mov w[[EXT:[0-9]+]], w3 +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] ret void } diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll index 1c1b58b8b140d..dc9884f12f571 100644 --- a/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -508,7 +508,7 @@ entry: ; "i64 %0" should be in register x7. ; "i32 8" should be on stack at [sp]. ; CHECK: ldr x7, [{{x[0-9]+}}] -; CHECK: str {{w[0-9]+}}, [sp] +; CHECK: str {{w[0-9]+}}, [sp, #-16]! 
; FAST-LABEL: i64_split ; FAST: ldr x7, [{{x[0-9]+}}] ; FAST: mov x[[R0:[0-9]+]], sp diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll index 4703d25a6016b..d46800d34cac8 100644 --- a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll +++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -1,6 +1,7 @@ -; RUN: llc -march arm64 < %s | FileCheck %s +; RUN: llc -march arm64 < %s -aarch64-collect-loh=false | FileCheck %s ; rdar://13452552 -; ModuleID = 'reduced_test.ll' +; Disable the collecting of LOH so that the labels do not get in the +; way of the NEXT patterns. target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" target triple = "arm64-apple-ios3.0.0" @@ -13,8 +14,8 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { ; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]] ; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw] ; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw] -; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]] -; CHECK-NEXT b.ne +; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]] +; CHECK-NEXT: b.ne ; Next BB ; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw ; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll index eb0cd3547bdad..36424506bee87 100644 --- a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll +++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s ; CHECK: foo -; CHECK: ldr w[[REG:[0-9]+]], [x19, #264] -; CHECK: str w[[REG]], [x19, #132] -; CHECK: ldr w{{[0-9]+}}, [x19, #264] +; CHECK: str w[[REG0:[0-9]+]], [x19, #264] +; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]] +; CHECK: str w[[REG1]], [x19, #132] define i32 
@foo(i32 %a) nounwind { %retval = alloca i32, align 4 diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll index f36e706b15dda..d5d9a1b98174b 100644 --- a/test/CodeGen/AArch64/arm64-arith.ll +++ b/test/CodeGen/AArch64/arm64-arith.ll @@ -123,7 +123,8 @@ entry: define i64 @t14(i16 %a, i64 %x) nounwind ssp { entry: ; CHECK-LABEL: t14: -; CHECK: add x0, x1, w0, uxth #3 +; CHECK: and w8, w0, #0xffff +; CHECK: add x0, x1, w8, uxtw #3 ; CHECK: ret %c = zext i16 %a to i64 %d = shl i64 %c, 3 diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll index a76cf74a6d0c2..44c24c51f0df5 100644 --- a/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -173,10 +173,13 @@ define i128 @atomic_load_seq_cst(i128* %p) { ret i128 %r } -define i128 @atomic_load_relaxed(i128* %p) { +define i128 @atomic_load_relaxed(i64, i64, i128* %p) { ; CHECK-LABEL: atomic_load_relaxed: ; CHECK-NOT: dmb -; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x2] +; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb %r = load atomic i128, i128* %p monotonic, align 16 ret i128 %r diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll index 0824bd881a95c..5d8d60de5fc5c 100644 --- a/test/CodeGen/AArch64/arm64-atomic.ll +++ b/test/CodeGen/AArch64/arm64-atomic.ll @@ -2,13 +2,17 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-LABEL: val_compare_and_swap: -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: stxr 
[[SCRATCH_REG:w[0-9]+]], w2, [x0] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] +; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -17,13 +21,16 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 { define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 { ; CHECK-LABEL: val_compare_and_swap_from_load: ; CHECK-NEXT: ldr [[NEW:w[0-9]+]], [x2] -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -32,13 +39,17 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 { define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-LABEL: val_compare_and_swap_rel: -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] ; CHECK-NEXT: cmp [[RESULT]], w1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x0] -; 
CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -47,13 +58,16 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 { define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 { ; CHECK-LABEL: val_compare_and_swap_64: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 -; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK-NEXT: ldxr [[RESULT:x[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], x1 -; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] -; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]] -; CHECK-NEXT: [[LABEL2]]: +; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] +; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: clrex +; CHECK-NEXT: [[EXITBB]]: %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val @@ -61,13 +75,13 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 { define i32 @fetch_and_nand(i32* %p) #0 { ; CHECK-LABEL: fetch_and_nand: -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] ; CHECK: mvn [[TMP_REG:w[0-9]+]], w[[DEST_REG]] ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] ; CHECK: mov x0, x[[DEST_REG]] %val = 
atomicrmw nand i32* %p, i32 7 release ret i32 %val @@ -76,12 +90,12 @@ define i32 @fetch_and_nand(i32* %p) #0 { define i64 @fetch_and_nand_64(i64* %p) #0 { ; CHECK-LABEL: fetch_and_nand_64: ; CHECK: mov x[[ADDR:[0-9]+]], x0 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]] ; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]] ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] %val = atomicrmw nand i64* %p, i64 7 acq_rel ret i64 %val @@ -90,12 +104,12 @@ define i64 @fetch_and_nand_64(i64* %p) #0 { define i32 @fetch_and_or(i32* %p) #0 { ; CHECK-LABEL: fetch_and_or: ; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0] ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]] ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] ; CHECK: mov x0, x[[DEST_REG]] %val = atomicrmw or i32* %p, i32 5 seq_cst ret i32 %val @@ -104,11 +118,11 @@ define i32 @fetch_and_or(i32* %p) #0 { define i64 @fetch_and_or_64(i64* %p) #0 { ; CHECK: fetch_and_or_64: ; CHECK: mov x[[ADDR:[0-9]+]], x0 -; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: [[TRYBB:.?LBB[0-9_]+]]: ; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7 ; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] -; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]] %val = atomicrmw or i64* %p, i64 7 monotonic ret i64 %val } diff --git a/test/CodeGen/AArch64/arm64-builtins-linux.ll b/test/CodeGen/AArch64/arm64-builtins-linux.ll new file 
mode 100644 index 0000000000000..34fa1b4715615 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-builtins-linux.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s + +; Function Attrs: nounwind readnone +declare i8* @llvm.aarch64.thread.pointer() #1 + +define i8* @thread_pointer() { +; CHECK: thread_pointer: +; CHECK: mrs {{x[0-9]+}}, TPIDR_EL0 + %1 = tail call i8* @llvm.aarch64.thread.pointer() + ret i8* %1 +} diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll index 4e47ab6c03f3e..25d874e54cb7c 100644 --- a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll +++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll @@ -15,10 +15,10 @@ target triple = "arm64-apple-ios7.0.0" ; CHECK: Maze1 ; CHECK: %if.then ; CHECK: cmp x{{[0-9]+}}, #2 -; CHECK-NEXT b.cc +; CHECK-NEXT: b.lo ; CHECK: %if.then ; CHECK: cmp x{{[0-9]+}}, #2 -; CHECK-NEXT b.cc +; CHECK-NEXT: b.lo define i32 @Maze1() nounwind ssp { entry: %0 = load i64, i64* @channelColumns, align 8, !tbaa !0 diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index ff18f73643371..72d3b8331162f 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -104,11 +104,14 @@ if.end: ; preds = %if.then, %lor.lhs.f ; Speculatively execute division by zero. ; The sdiv/udiv instructions do not trap when the divisor is zero, so they are ; safe to speculate. 
-; CHECK: speculate_division -; CHECK-NOT: cmp -; CHECK: sdiv -; CHECK: cmp -; CHECK-NEXT: ccmp +; CHECK-LABEL: speculate_division: +; CHECK: cmp w0, #1 +; CHECK: sdiv [[DIVRES:w[0-9]+]], w1, w0 +; CHECK: ccmp [[DIVRES]], #16, #0, ge +; CHECK: b.gt [[BLOCK:LBB[0-9_]+]] +; CHECK: bl _foo +; CHECK: [[BLOCK]]: +; CHECK: orr w0, wzr, #0x7 define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp sgt i32 %a, 0 @@ -287,3 +290,156 @@ sw.bb.i.i: %code1.i.i.phi.trans.insert = getelementptr inbounds %str1, %str1* %0, i64 0, i32 0, i32 0, i64 16 br label %sw.bb.i.i } + +; CHECK-LABEL: select_and +define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { +; CHECK: cmp w1, #5 +; CHECK-NEXT: ccmp w0, w1, #0, ne +; CHECK-NEXT: csel x0, x2, x3, lt +; CHECK-NEXT: ret + %1 = icmp slt i32 %w0, %w1 + %2 = icmp ne i32 5, %w1 + %3 = and i1 %1, %2 + %sel = select i1 %3, i64 %x2, i64 %x3 + ret i64 %sel +} + +; CHECK-LABEL: select_or +define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { +; CHECK: cmp w1, #5 +; CHECK-NEXT: ccmp w0, w1, #8, eq +; CHECK-NEXT: csel x0, x2, x3, lt +; CHECK-NEXT: ret + %1 = icmp slt i32 %w0, %w1 + %2 = icmp ne i32 5, %w1 + %3 = or i1 %1, %2 + %sel = select i1 %3, i64 %x2, i64 %x3 + ret i64 %sel +} + +; CHECK-LABEL: select_complicated +define i16 @select_complicated(double %v1, double %v2, i16 %a, i16 %b) { +; CHECK: ldr [[REG:d[0-9]+]], +; CHECK: fcmp d0, d2 +; CHECK-NEXT: fmov d2, #13.00000000 +; CHECK-NEXT: fccmp d1, d2, #4, ne +; CHECK-NEXT: fccmp d0, d1, #1, ne +; CHECK-NEXT: fccmp d0, d1, #4, vc +; CHECK-NEXT: csel w0, w0, w1, eq + %1 = fcmp one double %v1, %v2 + %2 = fcmp oeq double %v2, 13.0 + %3 = fcmp oeq double %v1, 42.0 + %or0 = or i1 %2, %3 + %or1 = or i1 %1, %or0 + %sel = select i1 %or1, i16 %a, i16 %b + ret i16 %sel +} + +; CHECK-LABEL: gccbug +define i64 @gccbug(i64 %x0, i64 %x1) { +; CHECK: cmp x0, #2 +; CHECK-NEXT: ccmp x0, #4, #4, ne +; CHECK-NEXT: ccmp x1, #0, #0, eq +; CHECK-NEXT: orr w[[REGNUM:[0-9]+]], wzr, 
#0x1 +; CHECK-NEXT: cinc x0, x[[REGNUM]], eq +; CHECK-NEXT: ret + %cmp0 = icmp eq i64 %x1, 0 + %cmp1 = icmp eq i64 %x0, 2 + %cmp2 = icmp eq i64 %x0, 4 + + %or = or i1 %cmp2, %cmp1 + %and = and i1 %or, %cmp0 + + %sel = select i1 %and, i64 2, i64 1 + ret i64 %sel +} + +; CHECK-LABEL: select_ororand +define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) { +; CHECK: cmp w3, #4 +; CHECK-NEXT: ccmp w2, #2, #0, gt +; CHECK-NEXT: ccmp w1, #13, #2, ge +; CHECK-NEXT: ccmp w0, #0, #4, ls +; CHECK-NEXT: csel w0, w3, wzr, eq +; CHECK-NEXT: ret + %c0 = icmp eq i32 %w0, 0 + %c1 = icmp ugt i32 %w1, 13 + %c2 = icmp slt i32 %w2, 2 + %c4 = icmp sgt i32 %w3, 4 + %or = or i1 %c0, %c1 + %and = and i1 %c2, %c4 + %or1 = or i1 %or, %and + %sel = select i1 %or1, i32 %w3, i32 0 + ret i32 %sel +} + +; CHECK-LABEL: select_andor +define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) { +; CHECK: cmp w1, w2 +; CHECK-NEXT: ccmp w0, #0, #4, lt +; CHECK-NEXT: ccmp w0, w1, #0, eq +; CHECK-NEXT: csel w0, w0, w1, eq +; CHECK-NEXT: ret + %c0 = icmp eq i32 %v1, %v2 + %c1 = icmp sge i32 %v2, %v3 + %c2 = icmp eq i32 %v1, 0 + %or = or i1 %c2, %c1 + %and = and i1 %or, %c0 + %sel = select i1 %and, i32 %v1, i32 %v2 + ret i32 %sel +} + +; CHECK-LABEL: select_noccmp1 +define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) { +; CHECK: cmp x0, #0 +; CHECK-NEXT: cset [[REG0:w[0-9]+]], lt +; CHECK-NEXT: cmp x0, #13 +; CHECK-NOT: ccmp +; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: cmp x2, #2 +; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt +; CHECK-NEXT: cmp x2, #4 +; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt +; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] +; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]] +; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]] +; CHECK-NEXT: cmp [[REG6]], #0 +; CHECK-NEXT: csel x0, xzr, x3, ne +; CHECK-NEXT: ret + %c0 = icmp slt i64 %v1, 0 + %c1 = icmp sgt i64 %v1, 13 + %c2 = icmp slt i64 %v3, 2 + %c4 = icmp sgt i64 %v3, 4 + %and0 = and i1 %c0, %c1 + %and1 = 
and i1 %c2, %c4 + %or = or i1 %and0, %and1 + %sel = select i1 %or, i64 0, i64 %r + ret i64 %sel +} + +@g = global i32 0 + +; Should not use ccmp if we have to compute the or expression in an integer +; register anyway because of other users. +; CHECK-LABEL: select_noccmp2 +define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) { +; CHECK: cmp x0, #0 +; CHECK-NEXT: cset [[REG0:w[0-9]+]], lt +; CHECK-NOT: ccmp +; CHECK-NEXT: cmp x0, #13 +; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: orr [[REG2:w[0-9]+]], [[REG0]], [[REG1]] +; CHECK-NEXT: cmp [[REG2]], #0 +; CHECK-NEXT: csel x0, xzr, x3, ne +; CHECK-NEXT: sbfx [[REG3:w[0-9]+]], [[REG2]], #0, #1 +; CHECK-NEXT: adrp x[[REGN4:[0-9]+]], _g@PAGE +; CHECK-NEXT: str [[REG3]], [x[[REGN4]], _g@PAGEOFF] +; CHECK-NEXT: ret + %c0 = icmp slt i64 %v1, 0 + %c1 = icmp sgt i64 %v1, 13 + %or = or i1 %c0, %c1 + %sel = select i1 %or, i64 0, i64 %r + %ext = sext i1 %or to i32 + store volatile i32 %ext, i32* @g + ret i64 %sel +} diff --git a/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll new file mode 100644 index 0000000000000..528d2538bb4ab --- /dev/null +++ b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK: orr w0, wzr, #0x1 +; CHECK-NEXT: bl foo +; CHECK-NEXT: orr w0, wzr, #0x1 +; CHECK-NEXT: bl foo + +target triple = "aarch64--linux-android" +declare i32 @foo(i32) + +; Function Attrs: nounwind uwtable +define i32 @main() { +entry: + %call = tail call i32 @foo(i32 1) + %call1 = tail call i32 @foo(i32 1) + ret i32 0 +} diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll index c0aa63cc43312..59147d401a305 100644 --- a/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -51,3 +51,607 @@ if.end4: ; preds = %if.then2, %if.then, %add6 = add nsw i32 %tmp3, %t.addr.0 ret i32 %add6 } + +@C = common global i32 0, align 4 + +; 
Check that we catch AdrpLdrGotLdr case when we have a simple chain: +; adrp -> ldrgot -> ldr. +; CHECK-LABEL: _getC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i32 @getC() { + %res = load i32, i32* @C, align 4 + ret i32 %res +} + +; LDRSW supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExtC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsw x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExtC() { + %res = load i32, i32* @C, align 4 + %sextres = sext i32 %res to i64 + ret i64 %sextres +} + +; It may not be safe to fold the literal in the load if the address is +; used several times. +; Make sure we emit AdrpLdrGot for those. 
+; CHECK-LABEL: _getSeveralC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 +; CHECK-NEXT: str [[ADD]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define void @getSeveralC(i32 %t) { +entry: + %tmp = load i32, i32* @C, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* @C, align 4 + ret void +} + +; Make sure we catch that: +; adrp -> ldrgot -> str. +; CHECK-LABEL: _setC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define void @setC(i32 %t) { +entry: + store i32 %t, i32* @C, align 4 + ret void +} + +; Perform the same tests for internal global and a displacement +; in the addressing mode. +; Indeed we will get an ADD for those instead of LOADGot. +@InternalC = internal global i32 0, align 4 + +; Check that we catch AdrpAddLdr case when we have a simple chain: +; adrp -> add -> ldr. 
+; CHECK-LABEL: _getInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr w0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] +define i32 @getInternalCPlus4() { + %addr = getelementptr i32, i32* @InternalC, i32 4 + %res = load i32, i32* %addr, align 4 + ret i32 %res +} + +; LDRSW supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExtInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsw x0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExtInternalCPlus4() { + %addr = getelementptr i32, i32* @InternalC, i32 4 + %res = load i32, i32* %addr, align 4 + %sextres = sext i32 %res to i64 + ret i64 %sextres +} + +; It may not be safe to fold the literal in the load if the address is +; used several times. +; Make sure we emit AdrpAdd for those. 
+; CHECK-LABEL: _getSeveralInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 +; CHECK-NEXT: str [[ADD]], {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAdd [[ADRP_LABEL]], [[ADDGOT_LABEL]] +define void @getSeveralInternalCPlus4(i32 %t) { +entry: + %addr = getelementptr i32, i32* @InternalC, i32 4 + %tmp = load i32, i32* %addr, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* %addr, align 4 + ret void +} + +; Make sure we catch that: +; adrp -> add -> str. +; CHECK-LABEL: _setInternalCPlus4 +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str w0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ret +; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] +define void @setInternalCPlus4(i32 %t) { +entry: + %addr = getelementptr i32, i32* @InternalC, i32 4 + store i32 %t, i32* %addr, align 4 + ret void +} + +; Check that we catch AdrpAddLdr case when we have a simple chain: +; adrp -> ldr. +; CHECK-LABEL: _getInternalC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr w0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]] +define i32 @getInternalC() { + %res = load i32, i32* @InternalC, align 4 + ret i32 %res +} + +; LDRSW supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getSExtInternalC +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsw x0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]] +define i64 @getSExtInternalC() { + %res = load i32, i32* @InternalC, align 4 + %sextres = sext i32 %res to i64 + ret i64 %sextres +} + +; It may not be safe to fold the literal in the load if the address is +; used several times. +; Make sure we do not catch anything here. We have an adrp alone, +; there is not much we can do about it. +; CHECK-LABEL: _getSeveralInternalC +; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 +; CHECK-NEXT: str [[ADD]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +define void @getSeveralInternalC(i32 %t) { +entry: + %tmp = load i32, i32* @InternalC, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* @InternalC, align 4 + ret void +} + +; Make sure we do not catch anything when: +; adrp -> str. +; We cannot fold anything in the str at this point. +; Indeed, strs do not support literals. +; CHECK-LABEL: _setInternalC +; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE +; CHECK-NEXT: str w0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ret +define void @setInternalC(i32 %t) { +entry: + store i32 %t, i32* @InternalC, align 4 + ret void +} + +; Now check other variant of loads/stores. + +@D = common global i8 0, align 4 + +; LDRB does not support loading from a literal. +; Make sure we emit AdrpLdrGot and not AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getD +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldrb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define i8 @getD() { + %res = load i8, i8* @D, align 4 + ret i8 %res +} + +; CHECK-LABEL: _setD +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: strb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setD(i8 %t) { + store i8 %t, i8* @D, align 4 + ret void +} + +; LDRSB supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExtD +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i32 @getSExtD() { + %res = load i8, i8* @D, align 4 + %sextres = sext i8 %res to i32 + ret i32 %sextres +} + +; LDRSB supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getSExt64D +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsb x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExt64D() { + %res = load i8, i8* @D, align 4 + %sextres = sext i8 %res to i64 + ret i64 %sextres +} + +@E = common global i16 0, align 4 + +; LDRH does not support loading from a literal. +; Make sure we emit AdrpLdrGot and not AdrpLdrGotLdr for those. +; CHECK-LABEL: _getE +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldrh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define i16 @getE() { + %res = load i16, i16* @E, align 4 + ret i16 %res +} + +; LDRSH supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getSExtE +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i32 @getSExtE() { + %res = load i16, i16* @E, align 4 + %sextres = sext i16 %res to i32 + ret i32 %sextres +} + +; CHECK-LABEL: _setE +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: strh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setE(i16 %t) { + store i16 %t, i16* @E, align 4 + ret void +} + +; LDRSH supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. +; CHECK-LABEL: _getSExt64E +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldrsh x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getSExt64E() { + %res = load i16, i16* @E, align 4 + %sextres = sext i16 %res to i64 + ret i64 %sextres +} + +@F = common global i64 0, align 4 + +; LDR supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getF +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define i64 @getF() { + %res = load i64, i64* @F, align 4 + ret i64 %res +} + +; CHECK-LABEL: _setF +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setF(i64 %t) { + store i64 %t, i64* @F, align 4 + ret void +} + +@G = common global float 0.0, align 4 + +; LDR float supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getG +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define float @getG() { + %res = load float, float* @G, align 4 + ret float %res +} + +; CHECK-LABEL: _setG +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setG(float %t) { + store float %t, float* @G, align 4 + ret void +} + +@H = common global half 0.0, align 4 + +; LDR half supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getH +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define half @getH() { + %res = load half, half* @H, align 4 + ret half %res +} + +; CHECK-LABEL: _setH +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setH(half %t) { + store half %t, half* @H, align 4 + ret void +} + +@I = common global double 0.0, align 4 + +; LDR double supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getI +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define double @getI() { + %res = load double, double* @I, align 4 + ret double %res +} + +; CHECK-LABEL: _setI +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setI(double %t) { + store double %t, double* @I, align 4 + ret void +} + +@J = common global <2 x i32> , align 4 + +; LDR 64-bit vector supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getJ +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define <2 x i32> @getJ() { + %res = load <2 x i32>, <2 x i32>* @J, align 4 + ret <2 x i32> %res +} + +; CHECK-LABEL: _setJ +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setJ(<2 x i32> %t) { + store <2 x i32> %t, <2 x i32>* @J, align 4 + ret void +} + +@K = common global <4 x i32> , align 4 + +; LDR 128-bit vector supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getK +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define <4 x i32> @getK() { + %res = load <4 x i32>, <4 x i32>* @K, align 4 + ret <4 x i32> %res +} + +; CHECK-LABEL: _setK +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] +define void @setK(<4 x i32> %t) { + store <4 x i32> %t, <4 x i32>* @K, align 4 + ret void +} + +@L = common global <1 x i8> , align 4 + +; LDR 8-bit vector supports loading from a literal. +; Make sure we emit AdrpLdrGotLdr for those. 
+; CHECK-LABEL: _getL +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr b0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] +define <1 x i8> @getL() { + %res = load <1 x i8>, <1 x i8>* @L, align 4 + ret <1 x i8> %res +} + +; CHECK-LABEL: _setL +; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE +; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; Ultimately we should generate str b0, but right now, we match the vector +; variant which does not allow to fold the immediate into the store. +; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ret +; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +define void @setL(<1 x i8> %t) { + store <1 x i8> %t, <1 x i8>* @L, align 4 + ret void +} + +; Make sure we do not assert when we do not track +; all the aliases of a tuple register. +; Indeed the tuple register can be tracked because of +; one of its element, but the other elements of the tuple +; do not need to be tracked and we used to assert on that. +; Note: The test case is fragile in the sense that we need +; a tuple register to appear in the lowering. Thus, the target +; cpu is required to have the problem reproduced. +; CHECK-LABEL: _uninterestingSub +; CHECK: adrp [[ADRP_REG:x[0-9]+]], [[CONSTPOOL:lCPI[0-9]+_[0-9]+]]@PAGE +; CHECK-NEXT: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] +; The tuple comes from the next instruction. 
+; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]] +; CHECK: ret +define void @uninterestingSub(i8* nocapture %row) #0 { + %tmp = bitcast i8* %row to <16 x i8>* + %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16 + %vext43 = shufflevector <16 x i8> , <16 x i8> %tmp1, <16 x i32> + %add.i.414 = add <16 x i8> zeroinitializer, %vext43 + store <16 x i8> %add.i.414, <16 x i8>* %tmp, align 16 + %add.ptr51 = getelementptr inbounds i8, i8* %row, i64 16 + %tmp2 = bitcast i8* %add.ptr51 to <16 x i8>* + %tmp3 = load <16 x i8>, <16 x i8>* %tmp2, align 16 + %tmp4 = bitcast i8* undef to <16 x i8>* + %tmp5 = load <16 x i8>, <16 x i8>* %tmp4, align 16 + %vext157 = shufflevector <16 x i8> %tmp3, <16 x i8> %tmp5, <16 x i32> + %add.i.402 = add <16 x i8> zeroinitializer, %vext157 + store <16 x i8> %add.i.402, <16 x i8>* %tmp4, align 16 + ret void +} + +attributes #0 = { "target-cpu"="cyclone" } diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll index 0ef7b143df807..55c9c6036ed57 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll @@ -94,9 +94,7 @@ entry: store i32 %c, i32* %c.addr, align 4 store i64 %d, i64* %d.addr, align 8 %0 = load i16, i16* %b.addr, align 2 -; CHECK: and w0, w0, #0x1 -; CHECK: cmp w0, #0 -; CHECK: b.eq LBB4_2 +; CHECK: tbz w0, #0, LBB4_2 %conv = trunc i16 %0 to i1 br i1 %conv, label %if.then, label %if.end @@ -106,9 +104,7 @@ if.then: ; preds = %entry if.end: ; preds = %if.then, %entry %1 = load i32, i32* %c.addr, align 4 -; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1 -; CHECK: cmp w[[REG]], #0 -; CHECK: b.eq LBB4_4 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_4 %conv1 = trunc i32 %1 to i1 br i1 %conv1, label %if.then3, label %if.end4 @@ -118,8 +114,7 @@ if.then3: ; preds = %if.end if.end4: ; preds = %if.then3, %if.end %2 = load i64, i64* %d.addr, align 8 -; CHECK: cmp w{{[0-9]+}}, #0 -; CHECK: b.eq LBB4_6 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_6 
%conv5 = trunc i64 %2 to i1 br i1 %conv5, label %if.then7, label %if.end8 @@ -139,9 +134,7 @@ define i32 @trunc64(i64 %foo) nounwind { ; CHECK: trunc64 ; CHECK: and [[REG1:x[0-9]+]], x0, #0x1 ; CHECK: mov x[[REG2:[0-9]+]], [[REG1]] -; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0x1 -; CHECK: cmp [[REG3]], #0 -; CHECK: b.eq LBB5_2 +; CHECK: tbz w[[REG2]], #0, LBB5_2 %a = and i64 %foo, 1 %b = trunc i64 %a to i1 br i1 %b, label %if.then, label %if.else diff --git a/test/CodeGen/AArch64/arm64-fmax-safe.ll b/test/CodeGen/AArch64/arm64-fmax-safe.ll new file mode 100644 index 0000000000000..8b7d66986e786 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-fmax-safe.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s + +define double @test_direct(float %in) { +; CHECK-LABEL: test_direct: + %cmp = fcmp olt float %in, 0.000000e+00 + %val = select i1 %cmp, float 0.000000e+00, float %in + %longer = fpext float %val to double + ret double %longer + +; CHECK: fmax s +} + +define double @test_cross(float %in) { +; CHECK-LABEL: test_cross: + %cmp = fcmp ult float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer + +; CHECK: fmin s +} + +; Same as previous, but with ordered comparison; +; must become fminnm, not fmin. +define double @test_cross_fail_nan(float %in) { +; CHECK-LABEL: test_cross_fail_nan: + %cmp = fcmp olt float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer + +; CHECK: fminnm s +} + +; This isn't a min or a max, but passes the first condition for swapping the +; results. Make sure they're put back before we resort to the normal fcsel. 
+define float @test_cross_fail(float %lhs, float %rhs) { +; CHECK-LABEL: test_cross_fail: + %tst = fcmp une float %lhs, %rhs + %res = select i1 %tst, float %rhs, float %lhs + ret float %res + + ; The register allocator would have to decide to be deliberately obtuse before + ; other register were used. +; CHECK: fcsel s0, s1, s0, ne +} + +; Make sure the transformation isn't triggered for integers +define i64 @test_integer(i64 %in) { + %cmp = icmp slt i64 %in, 0 + %val = select i1 %cmp, i64 0, i64 %in + ret i64 %val +} diff --git a/test/CodeGen/AArch64/arm64-fmax.ll b/test/CodeGen/AArch64/arm64-fmax.ll index ea281528b84ce..40cc36ea52fa1 100644 --- a/test/CodeGen/AArch64/arm64-fmax.ll +++ b/test/CodeGen/AArch64/arm64-fmax.ll @@ -1,57 +1,48 @@ ; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s -; RUN: llc -march=arm64 < %s | FileCheck %s --check-prefix=CHECK-SAFE define double @test_direct(float %in) { ; CHECK-LABEL: test_direct: -; CHECK-SAFE-LABEL: test_direct: - %cmp = fcmp olt float %in, 0.000000e+00 - %longer = fpext float %in to double - %val = select i1 %cmp, double 0.000000e+00, double %longer - ret double %val + %cmp = fcmp nnan olt float %in, 0.000000e+00 + %val = select i1 %cmp, float 0.000000e+00, float %in + %longer = fpext float %val to double + ret double %longer ; CHECK: fmax -; CHECK-SAFE: fmax } define double @test_cross(float %in) { ; CHECK-LABEL: test_cross: -; CHECK-SAFE-LABEL: test_cross: - %cmp = fcmp ult float %in, 0.000000e+00 - %longer = fpext float %in to double - %val = select i1 %cmp, double %longer, double 0.000000e+00 - ret double %val + %cmp = fcmp nnan ult float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer ; CHECK: fmin -; CHECK-SAFE: fmin } ; Same as previous, but with ordered comparison; ; can't be converted in safe-math mode. 
define double @test_cross_fail_nan(float %in) { ; CHECK-LABEL: test_cross_fail_nan: -; CHECK-SAFE-LABEL: test_cross_fail_nan: - %cmp = fcmp olt float %in, 0.000000e+00 - %longer = fpext float %in to double - %val = select i1 %cmp, double %longer, double 0.000000e+00 - ret double %val + %cmp = fcmp nnan olt float %in, 0.000000e+00 + %val = select i1 %cmp, float %in, float 0.000000e+00 + %longer = fpext float %val to double + ret double %longer ; CHECK: fmin -; CHECK-SAFE: fcsel d0, d1, d0, mi } ; This isn't a min or a max, but passes the first condition for swapping the ; results. Make sure they're put back before we resort to the normal fcsel. define float @test_cross_fail(float %lhs, float %rhs) { ; CHECK-LABEL: test_cross_fail: -; CHECK-SAFE-LABEL: test_cross_fail: - %tst = fcmp une float %lhs, %rhs + %tst = fcmp nnan une float %lhs, %rhs %res = select i1 %tst, float %rhs, float %lhs ret float %res ; The register allocator would have to decide to be deliberately obtuse before ; other register were used. ; CHECK: fcsel s0, s1, s0, ne -; CHECK-SAFE: fcsel s0, s1, s0, ne } ; Make sure the transformation isn't triggered for integers @@ -60,3 +51,14 @@ define i64 @test_integer(i64 %in) { %val = select i1 %cmp, i64 0, i64 %in ret i64 %val } + +define float @test_f16(half %in) { +; CHECK-LABEL: test_f16: + %cmp = fcmp nnan ult half %in, 0.000000e+00 + %val = select i1 %cmp, half %in, half 0.000000e+00 + %longer = fpext half %val to float + ret float %longer +; FIXME: It'd be nice for this to create an fmin instruction! 
+; CHECK: fcvt +; CHECK: fcsel +} diff --git a/test/CodeGen/AArch64/arm64-fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll index aaef39fcf512e..097fe2ca6ed9a 100644 --- a/test/CodeGen/AArch64/arm64-fp128.ll +++ b/test/CodeGen/AArch64/arm64-fp128.ll @@ -148,14 +148,9 @@ define i1 @test_setcc2() { ; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs] %val = fcmp ugt fp128 %lhs, %rhs -; CHECK: bl __gttf2 +; CHECK: bl __letf2 ; CHECK: cmp w0, #0 -; CHECK: cset [[GT:w[0-9]+]], gt - -; CHECK: bl __unordtf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[UNORDERED:w[0-9]+]], ne -; CHECK: orr w0, [[UNORDERED]], [[GT]] +; CHECK: cset w0, gt ret i1 %val ; CHECK: ret @@ -169,31 +164,21 @@ define i32 @test_br_cc() { ; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs] ; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs] - ; olt == !uge, which LLVM unfortunately "optimizes" this to. + ; olt == !uge, which LLVM optimizes this to. %cond = fcmp olt fp128 %lhs, %rhs -; CHECK: bl __getf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[OGE:w[0-9]+]], ge - -; CHECK: bl __unordtf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[UNORDERED:w[0-9]+]], ne - -; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]] -; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]] +; CHECK: bl __lttf2 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: b.ge {{.LBB[0-9]+_[0-9]+}} br i1 %cond, label %iftrue, label %iffalse iftrue: ret i32 42 ; CHECK-NEXT: BB# ; CHECK-NEXT: movz w0, #0x2a -; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]] - +; CHECK: ret iffalse: ret i32 29 -; CHECK: [[RET29]]: -; CHECK-NEXT: movz w0, #0x1d -; CHECK-NEXT: [[REALRET]]: +; CHECK: movz w0, #0x1d ; CHECK: ret } diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll index f1c4e9bbaed95..895bfe4b3915a 100644 --- a/test/CodeGen/AArch64/arm64-hello.ll +++ b/test/CodeGen/AArch64/arm64-hello.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc < %s 
-mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX ; CHECK-LABEL: main: ; CHECK: stp x29, x30, [sp, #-16]! diff --git a/test/CodeGen/AArch64/arm64-indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll index b52cddf600ac4..b6ab9934dbc3a 100644 --- a/test/CodeGen/AArch64/arm64-indexed-memory.ll +++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -81,6 +81,17 @@ define void @truncst64to8(i8** nocapture %out, i8 %index, i64 %spacing) nounwind } +define void @storef16(half** %out, half %index, half %spacing) nounwind { +; CHECK-LABEL: storef16: +; CHECK: str h{{[0-9+]}}, [x{{[0-9+]}}], #2 +; CHECK: ret + %tmp = load half*, half** %out, align 2 + %incdec.ptr = getelementptr inbounds half, half* %tmp, i64 1 + store half %spacing, half* %tmp, align 2 + store half* %incdec.ptr, half** %out, align 2 + ret void +} + define void @storef32(float** nocapture %out, float %index, float %spacing) nounwind noinline ssp { ; CHECK-LABEL: storef32: ; CHECK: str s{{[0-9+]}}, [x{{[0-9+]}}], #4 @@ -125,6 +136,17 @@ define float * @pref32(float** nocapture %out, float %spacing) nounwind noinline ret float *%ptr } +define half* @pref16(half** %out, half %spacing) nounwind { +; CHECK-LABEL: pref16: +; CHECK: ldr x0, [x0] +; CHECK-NEXT: str h0, [x0, #6]! +; CHECK-NEXT: ret + %tmp = load half*, half** %out, align 2 + %ptr = getelementptr inbounds half, half* %tmp, i64 3 + store half %spacing, half* %ptr, align 2 + ret half *%ptr +} + define i64 * @pre64(i64** nocapture %out, i64 %spacing) nounwind noinline ssp { ; CHECK-LABEL: pre64: ; CHECK: ldr x0, [x0] @@ -230,6 +252,17 @@ define float* @preidxf32(float* %src, float* %out) { ret float* %ptr } +define half* @preidxf16(half* %src, half* %out) { +; CHECK-LABEL: preidxf16: +; CHECK: ldr h0, [x0, #2]! 
+; CHECK: str h0, [x1] +; CHECK: ret + %ptr = getelementptr inbounds half, half* %src, i64 1 + %tmp = load half, half* %ptr, align 2 + store half %tmp, half* %out, align 2 + ret half* %ptr +} + define i64* @preidx64(i64* %src, i64* %out) { ; CHECK-LABEL: preidx64: ; CHECK: ldr x[[REG:[0-9]+]], [x0, #8]! diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index ba31513172d56..98d4e3646f56c 100644 --- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-ios7.0 -disable-post-ra -o - %s | FileCheck %s @ptr = global i8* null @@ -6215,3 +6215,27 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(i16* %bar, i16** %pt } declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) + +; CHECK-LABEL: test_ld1lane_build: +; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[0], [x0] +; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[1], [x1] +; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[0], [x2] +; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[1], [x3] +; CHECK: sub.2s v[[REGNUM2:[0-9]+]], [[REG0]], [[REG1]] +; CHECK-NEXT: str d[[REGNUM2]], [x4] +; CHECK-NEXT: ret +define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, <2 x i32>* %out) { + %load0 = load i32, i32* %ptr0, align 4 + %load1 = load i32, i32* %ptr1, align 4 + %vec0_0 = insertelement <2 x i32> undef, i32 %load0, i32 0 + %vec0_1 = insertelement <2 x i32> %vec0_0, i32 %load1, i32 1 + + %load2 = load i32, i32* %ptr2, align 4 + %load3 = load i32, i32* %ptr3, align 4 + %vec1_0 = insertelement <2 x i32> undef, i32 %load2, i32 0 + %vec1_1 = insertelement <2 x i32> %vec1_0, i32 %load3, i32 1 + + %sub = sub nsw <2 x i32> %vec0_1, %vec1_1 + store <2 x i32> %sub, <2 x i32>* %out, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index 
802d95826ce4a..ac6e8a7731c69 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as -disable-post-ra | FileCheck %s ; rdar://9167275 diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll index dee0344835419..c65cf95be2e57 100644 --- a/test/CodeGen/AArch64/arm64-join-reserved.ll +++ b/test/CodeGen/AArch64/arm64-join-reserved.ll @@ -5,7 +5,7 @@ target triple = "arm64-apple-macosx10" ; A move isn't necessary. ; ; CHECK-LABEL: g: -; CHECK: str xzr, [sp] +; CHECK: str xzr, [sp, #-16]! ; CHECK: bl ; CHECK: ret define void @g() nounwind ssp { diff --git a/test/CodeGen/AArch64/arm64-large-frame.ll b/test/CodeGen/AArch64/arm64-large-frame.ll index c4cce36bcb74b..d1244e73b0f33 100644 --- a/test/CodeGen/AArch64/arm64-large-frame.ll +++ b/test/CodeGen/AArch64/arm64-large-frame.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim -disable-post-ra < %s | FileCheck %s declare void @use_addr(i8*) @addr = global i8* null diff --git a/test/CodeGen/AArch64/arm64-ld-from-st.ll b/test/CodeGen/AArch64/arm64-ld-from-st.ll new file mode 100644 index 0000000000000..dd8add70cdb7c --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ld-from-st.ll @@ -0,0 +1,666 @@ +; RUN: llc < %s -mtriple aarch64--none-eabi -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: Str64Ldr64 +; CHECK: mov x0, x1 +define i64 @Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i64* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 1 + %1 = load i64, i64* %arrayidx1 + ret i64 %1 
+} + +; CHECK-LABEL: Str64Ldr32_0 +; CHECK: and x0, x1, #0xffffffff +define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 2 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str64Ldr32_1 +; CHECK: lsr x0, x1, #32 +define i32 @Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 3 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str64Ldr16_0 +; CHECK: and x0, x1, #0xffff +define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 4 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_1 +; CHECK: ubfx x0, x1, #16, #16 +define i16 @Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 5 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_2 +; CHECK: ubfx x0, x1, #32, #16 +define i16 @Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 6 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr16_3 +; CHECK: lsr x0, x1, #48 +define i16 @Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr 
inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 7 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str64Ldr8_0 +; CHECK: and x0, x1, #0xff +define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 8 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_1 +; CHECK: ubfx x0, x1, #8, #8 +define i8 @Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 9 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_2 +; CHECK: ubfx x0, x1, #16, #8 +define i8 @Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 10 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_3 +; CHECK: ubfx x0, x1, #24, #8 +define i8 @Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 11 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_4 +; CHECK: ubfx x0, x1, #32, #8 +define i8 @Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 12 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_5 +; CHECK: ubfx x0, x1, #40, #8 +define i8 
@Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 13 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_6 +; CHECK: ubfx x0, x1, #48, #8 +define i8 @Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 14 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str64Ldr8_7 +; CHECK: lsr x0, x1, #56 +define i8 @Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 15 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr32 +; CHECK: mov w0, w1 +define i32 @Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i32* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Str32Ldr16_0 +; CHECK: and w0, w1, #0xffff +define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str32Ldr16_1 +; CHECK: lsr w0, w1, #16 +define i16 @Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3 + 
%1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str32Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 5 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_2 +; CHECK: ubfx w0, w1, #16, #8 +define i8 @Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 6 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str32Ldr8_3 +; CHECK: lsr w0, w1, #24 +define i8 @Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 7 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str16Ldr16 +; CHECK: and w0, w1, #0xffff +define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i16* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Str16Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds 
i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Str16Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 3 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + + +; CHECK-LABEL: Unscaled_Str64Ldr64 +; CHECK: mov x0, x1 +define i64 @Unscaled_Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i64* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 -1 + %1 = load i64, i64* %arrayidx1 + ret i64 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr32_0 +; CHECK: and x0, x1, #0xffffffff +define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -2 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr32_1 +; CHECK: lsr x0, x1, #32 +define i32 @Unscaled_Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i32* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_0 +; CHECK: and x0, x1, #0xffff +define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -4 + %1 = load i16, i16* %arrayidx1 + 
ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_1 +; CHECK: ubfx x0, x1, #16, #16 +define i16 @Unscaled_Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_2 +; CHECK: ubfx x0, x1, #32, #16 +define i16 @Unscaled_Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr16_3 +; CHECK: lsr x0, x1, #48 +define i16 @Unscaled_Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_0 +; CHECK: and x0, x1, #0xff +define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -8 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_1 +; CHECK: ubfx x0, x1, #8, #8 +define i8 @Unscaled_Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -7 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_2 +; CHECK: ubfx x0, x1, #16, #8 +define i8 
@Unscaled_Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -6 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_3 +; CHECK: ubfx x0, x1, #24, #8 +define i8 @Unscaled_Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -5 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_4 +; CHECK: ubfx x0, x1, #32, #8 +define i8 @Unscaled_Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_5 +; CHECK: ubfx x0, x1, #40, #8 +define i8 @Unscaled_Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_6 +; CHECK: ubfx x0, x1, #48, #8 +define i8 @Unscaled_Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str64Ldr8_7 +; CHECK: lsr x0, x1, #56 +define i8 @Unscaled_Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) { +entry: + %0 = bitcast i64* %P to i8* + %arrayidx0 = getelementptr inbounds i64, i64* 
%P, i64 -1 + store i64 %v, i64* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr32 +; CHECK: mov w0, w1 +define i32 @Unscaled_Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i32* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1 + %1 = load i32, i32* %arrayidx1 + ret i32 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr16_0 +; CHECK: and w0, w1, #0xffff +define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr16_1 +; CHECK: lsr w0, w1, #16 +define i16 @Unscaled_Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Unscaled_Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3 + %1 = load i8, i8* %arrayidx1 + ret 
i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_2 +; CHECK: ubfx w0, w1, #16, #8 +define i8 @Unscaled_Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str32Ldr8_3 +; CHECK: lsr w0, w1, #24 +define i8 @Unscaled_Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i8* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr16 +; CHECK: and w0, w1, #0xffff +define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i16* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr8_0 +; CHECK: and w0, w1, #0xff +define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: Unscaled_Str16Ldr8_1 +; CHECK: ubfx w0, w1, #8, #8 +define i8 @Unscaled_Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) { +entry: + %0 = bitcast i16* %P to i8* + %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1 + store i16 %v, i16* %arrayidx0 + %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1 + %1 = load i8, i8* %arrayidx1 + ret i8 %1 +} + +; CHECK-LABEL: StrVolatileLdr +; CHECK: ldrh +define i16 @StrVolatileLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast 
i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load volatile i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: StrNotInRangeLdr +; CHECK: ldrh +define i16 @StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: Unscaled_StrNotInRangeLdr +; CHECK: ldurh +define i16 @Unscaled_StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1 + store i32 %v, i32* %arrayidx0 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +; CHECK-LABEL: StrCallLdr +; CHECK: ldrh +define i16 @StrCallLdr(i32* nocapture %P, i32 %v, i64 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + %c = call i1 @test_dummy() + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} + +declare i1 @test_dummy() + +; CHECK-LABEL: StrStrLdr +; CHECK: ldrh +define i16 @StrStrLdr(i32 %v, i32* %P, i32* %P2, i32 %n) { +entry: + %0 = bitcast i32* %P to i16* + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 %v, i32* %arrayidx0 + store i32 %n, i32* %P2 + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2 + %1 = load i16, i16* %arrayidx1 + ret i16 %1 +} diff --git a/test/CodeGen/AArch64/arm64-ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll index a192eab112faf..6071d092f8b37 100644 --- a/test/CodeGen/AArch64/arm64-ldp.ll +++ b/test/CodeGen/AArch64/arm64-ldp.ll @@ -1,8 +1,6 @@ ; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s 
-; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\ -; RUN: -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s -; CHECK: ldp_int +; CHECK-LABEL: ldp_int ; CHECK: ldp define i32 @ldp_int(i32* %p) nounwind { %tmp = load i32, i32* %p, align 4 @@ -12,7 +10,7 @@ define i32 @ldp_int(i32* %p) nounwind { ret i32 %add } -; CHECK: ldp_sext_int +; CHECK-LABEL: ldp_sext_int ; CHECK: ldpsw define i64 @ldp_sext_int(i32* %p) nounwind { %tmp = load i32, i32* %p, align 4 @@ -51,7 +49,7 @@ define i64 @ldp_half_sext_res1_int(i32* %p) nounwind { } -; CHECK: ldp_long +; CHECK-LABEL: ldp_long ; CHECK: ldp define i64 @ldp_long(i64* %p) nounwind { %tmp = load i64, i64* %p, align 8 @@ -61,7 +59,7 @@ define i64 @ldp_long(i64* %p) nounwind { ret i64 %add } -; CHECK: ldp_float +; CHECK-LABEL: ldp_float ; CHECK: ldp define float @ldp_float(float* %p) nounwind { %tmp = load float, float* %p, align 4 @@ -71,7 +69,7 @@ define float @ldp_float(float* %p) nounwind { ret float %add } -; CHECK: ldp_double +; CHECK-LABEL: ldp_double ; CHECK: ldp define double @ldp_double(double* %p) nounwind { %tmp = load double, double* %p, align 8 @@ -83,10 +81,10 @@ define double @ldp_double(double* %p) nounwind { ; Test the load/store optimizer---combine ldurs into a ldp, if appropriate define i32 @ldur_int(i32* %a) nounwind { -; LDUR_CHK: ldur_int -; LDUR_CHK: ldp [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8] -; LDUR_CHK-NEXT: add w{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_int +; CHECK: ldp [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8] +; CHECK-NEXT: add w{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -96,10 +94,10 @@ define i32 @ldur_int(i32* %a) nounwind { } define i64 @ldur_sext_int(i32* %a) nounwind { -; LDUR_CHK: ldur_sext_int -; LDUR_CHK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-8] -; LDUR_CHK-NEXT: add 
x{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_sext_int +; CHECK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-8] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -111,11 +109,11 @@ define i64 @ldur_sext_int(i32* %a) nounwind { } define i64 @ldur_half_sext_int_res0(i32* %a) nounwind { -; LDUR_CHK: ldur_half_sext_int_res0 -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] -; LDUR_CHK: sxtw x[[DST1]], w[[DST1]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_half_sext_int_res0 +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] +; CHECK: sxtw x[[DST1]], w[[DST1]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -127,11 +125,11 @@ define i64 @ldur_half_sext_int_res0(i32* %a) nounwind { } define i64 @ldur_half_sext_int_res1(i32* %a) nounwind { -; LDUR_CHK: ldur_half_sext_int_res1 -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] -; LDUR_CHK: sxtw x[[DST2]], w[[DST2]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_half_sext_int_res1 +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8] +; CHECK: sxtw x[[DST2]], w[[DST2]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i32 -2 @@ -144,10 +142,10 @@ define i64 @ldur_half_sext_int_res1(i32* %a) nounwind { define i64 @ldur_long(i64* %a) nounwind ssp { -; LDUR_CHK: ldur_long -; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], 
[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_long +; CHECK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -1 %tmp1 = load i64, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %a, i64 -2 @@ -157,10 +155,10 @@ define i64 @ldur_long(i64* %a) nounwind ssp { } define float @ldur_float(float* %a) { -; LDUR_CHK: ldur_float -; LDUR_CHK: ldp [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8] -; LDUR_CHK-NEXT: add s{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_float +; CHECK: ldp [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8] +; CHECK-NEXT: add s{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds float, float* %a, i64 -1 %tmp1 = load float, float* %p1, align 2 %p2 = getelementptr inbounds float, float* %a, i64 -2 @@ -170,10 +168,10 @@ define float @ldur_float(float* %a) { } define double @ldur_double(double* %a) { -; LDUR_CHK: ldur_double -; LDUR_CHK: ldp [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16] -; LDUR_CHK-NEXT: add d{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: ldur_double +; CHECK: ldp [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16] +; CHECK-NEXT: add d{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds double, double* %a, i64 -1 %tmp1 = load double, double* %p1, align 2 %p2 = getelementptr inbounds double, double* %a, i64 -2 @@ -184,11 +182,11 @@ define double @ldur_double(double* %a) { ; Now check some boundary conditions define i64 @pairUpBarelyIn(i64* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyIn -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyIn +; CHECK-NOT: ldur +; CHECK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; 
CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -31 %tmp1 = load i64, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %a, i64 -32 @@ -198,11 +196,11 @@ define i64 @pairUpBarelyIn(i64* %a) nounwind ssp { } define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyInSext -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyInSext +; CHECK-NOT: ldur +; CHECK: ldpsw [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256] +; CHECK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -63 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -64 @@ -214,12 +212,12 @@ define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp { } define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyInHalfSextRes0 -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] -; LDUR_CHK: sxtw x[[DST1]], w[[DST1]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyInHalfSextRes0 +; CHECK-NOT: ldur +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] +; CHECK: sxtw x[[DST1]], w[[DST1]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -63 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -64 @@ -231,12 +229,12 @@ define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp { } define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyInHalfSextRes1 -; LDUR_CHK-NOT: ldur -; LDUR_CHK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] -; LDUR_CHK: sxtw x[[DST2]], w[[DST2]] -; LDUR_CHK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpBarelyInHalfSextRes1 
+; CHECK-NOT: ldur +; CHECK: ldp w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256] +; CHECK: sxtw x[[DST2]], w[[DST2]] +; CHECK-NEXT: add x{{[0-9]+}}, x[[DST2]], x[[DST1]] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -63 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -64 @@ -248,12 +246,12 @@ define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp { } define i64 @pairUpBarelyOut(i64* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyOut -; LDUR_CHK-NOT: ldp +; CHECK-LABEL: pairUpBarelyOut +; CHECK-NOT: ldp ; Don't be fragile about which loads or manipulations of the base register ; are used---just check that there isn't an ldp before the add -; LDUR_CHK: add -; LDUR_CHK-NEXT: ret +; CHECK: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -32 %tmp1 = load i64, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %a, i64 -33 @@ -263,12 +261,12 @@ define i64 @pairUpBarelyOut(i64* %a) nounwind ssp { } define i64 @pairUpBarelyOutSext(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpBarelyOutSext -; LDUR_CHK-NOT: ldp +; CHECK-LABEL: pairUpBarelyOutSext +; CHECK-NOT: ldp ; Don't be fragile about which loads or manipulations of the base register ; are used---just check that there isn't an ldp before the add -; LDUR_CHK: add -; LDUR_CHK-NEXT: ret +; CHECK: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -64 %tmp1 = load i32, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %a, i64 -65 @@ -280,12 +278,12 @@ define i64 @pairUpBarelyOutSext(i32* %a) nounwind ssp { } define i64 @pairUpNotAligned(i64* %a) nounwind ssp { -; LDUR_CHK: pairUpNotAligned -; LDUR_CHK-NOT: ldp -; LDUR_CHK: ldur -; LDUR_CHK-NEXT: ldur -; LDUR_CHK-NEXT: add -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpNotAligned +; CHECK-NOT: ldp +; CHECK: ldur +; CHECK-NEXT: ldur +; CHECK-NEXT: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -18 %bp1 = bitcast i64* %p1 to i8* %bp1p1 = getelementptr 
inbounds i8, i8* %bp1, i64 1 @@ -303,12 +301,12 @@ define i64 @pairUpNotAligned(i64* %a) nounwind ssp { } define i64 @pairUpNotAlignedSext(i32* %a) nounwind ssp { -; LDUR_CHK: pairUpNotAlignedSext -; LDUR_CHK-NOT: ldp -; LDUR_CHK: ldursw -; LDUR_CHK-NEXT: ldursw -; LDUR_CHK-NEXT: add -; LDUR_CHK-NEXT: ret +; CHECK-LABEL: pairUpNotAlignedSext +; CHECK-NOT: ldp +; CHECK: ldursw +; CHECK-NEXT: ldursw +; CHECK-NEXT: add +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -18 %bp1 = bitcast i32* %p1 to i8* %bp1p1 = getelementptr inbounds i8, i8* %bp1, i64 1 @@ -326,3 +324,35 @@ define i64 @pairUpNotAlignedSext(i32* %a) nounwind ssp { %tmp3 = add i64 %sexttmp1, %sexttmp2 ret i64 %tmp3 } + +declare void @use-ptr(i32*) + +; CHECK-LABEL: ldp_sext_int_pre +; CHECK: ldpsw x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}, #8] +define i64 @ldp_sext_int_pre(i32* %p) nounwind { + %ptr = getelementptr inbounds i32, i32* %p, i64 2 + call void @use-ptr(i32* %ptr) + %add.ptr = getelementptr inbounds i32, i32* %ptr, i64 0 + %tmp = load i32, i32* %add.ptr, align 4 + %add.ptr1 = getelementptr inbounds i32, i32* %ptr, i64 1 + %tmp1 = load i32, i32* %add.ptr1, align 4 + %sexttmp = sext i32 %tmp to i64 + %sexttmp1 = sext i32 %tmp1 to i64 + %add = add nsw i64 %sexttmp1, %sexttmp + ret i64 %add +} + +; CHECK-LABEL: ldp_sext_int_post +; CHECK: ldpsw x{{[0-9]+}}, x{{[0-9]+}}, [x0], #8 +define i64 @ldp_sext_int_post(i32* %p) nounwind { + %tmp = load i32, i32* %p, align 4 + %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 + %tmp1 = load i32, i32* %add.ptr, align 4 + %sexttmp = sext i32 %tmp to i64 + %sexttmp1 = sext i32 %tmp1 to i64 + %ptr = getelementptr inbounds i32, i32* %add.ptr, i64 1 + call void @use-ptr(i32* %ptr) + %add = add nsw i64 %sexttmp1, %sexttmp + ret i64 %add +} + diff --git a/test/CodeGen/AArch64/arm64-long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll index d5baf16bdd5ce..ad89d3ff711bc 100644 --- a/test/CodeGen/AArch64/arm64-long-shift.ll +++ 
b/test/CodeGen/AArch64/arm64-long-shift.ll @@ -2,18 +2,20 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: shl: -; CHECK: lsl [[XREG_0:x[0-9]+]], x1, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsr [[XREG_3:x[0-9]+]], x0, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]] -; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsl [[XREG_5:x[0-9]+]], x0, [[XREG_4]] -; CHECK-NEXT: cmp [[XREG_4]], #0 -; CHECK-NEXT: csel x1, [[XREG_5]], [[XREG_6]], ge -; CHECK-NEXT: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 -; CHECK-NEXT: csel x0, xzr, [[SMALLSHIFT_LO]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsr [[LO_FOR_HI_NORMAL:x[0-9]+]], x0, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[LO_FOR_HI:x[0-9]+]], xzr, [[LO_FOR_HI_NORMAL]], eq +; CHECK: lsl [[HI_FOR_HI:x[0-9]+]], x1, x2 +; CHECK: orr [[HI_NORMAL:x[0-9]+]], [[LO_FOR_HI]], [[HI_FOR_HI]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsl [[HI_BIG_SHIFT:x[0-9]+]], x0, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x1, [[HI_BIG_SHIFT]], [[HI_NORMAL]], ge +; CHECK: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2 +; CHECK: csel x0, xzr, [[SMALLSHIFT_LO]], ge +; CHECK: ret %shl = shl i128 %r, %s ret i128 %shl @@ -21,19 +23,21 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone { define i128 @ashr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: ashr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: asr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], 
[[XREG_4]], ge -; CHECK-NEXT: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 -; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: asr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63 +; CHECK: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = ashr i128 %r, %s ret i128 %shr @@ -41,18 +45,20 @@ define i128 @ashr(i128 %r, i128 %s) nounwind readnone { define i128 @lshr(i128 %r, i128 %s) nounwind readnone { ; CHECK-LABEL: lshr: -; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2 -; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40 -; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2 -; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]] -; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]] -; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64 -; CHECK-NEXT: lsr [[XREG_6:x[0-9]+]], x1, [[XREG_5]] -; CHECK-NEXT: cmp [[XREG_5]], #0 -; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge -; CHECK-NEXT: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 -; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge -; CHECK-NEXT: ret +; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40 +; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2 +; CHECK: lsl [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]] +; CHECK: cmp x2, #0 +; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq +; CHECK: lsr [[LO_FOR_LO:x[0-9]+]], x0, x2 +; 
CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]] +; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64 +; CHECK: lsr [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]] +; CHECK: cmp [[EXTRA_SHIFT]], #0 +; CHECK: csel x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge +; CHECK: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2 +; CHECK: csel x1, xzr, [[SMALLSHIFT_HI]], ge +; CHECK: ret %shr = lshr i128 %r, %s ret i128 %shr diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll index 5bc4d71501ba4..85572f2cf0f8c 100644 --- a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll +++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios -aarch64-strict-align < %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s ; Small (16-bytes here) unaligned memcpys should stay memcpy calls if ; strict-alignment is turned on. diff --git a/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll new file mode 100644 index 0000000000000..5276ac334a71e --- /dev/null +++ b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll @@ -0,0 +1,406 @@ +; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE +; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE + +; CHECK-LABEL: Ldrh_merge +; CHECK-NOT: ldrh +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i16 @Ldrh_merge(i16* nocapture readonly %p) { + %1 = load i16, i16* %p, align 2 + %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1 + %2 = load i16, i16* %arrayidx2, align 2 + %add = 
sub nuw nsw i16 %1, %2 + ret i16 %add +} + +; CHECK-LABEL: Ldurh_merge +; CHECK-NOT: ldurh +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i16 @Ldurh_merge(i16* nocapture readonly %p) { +entry: + %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2 + %0 = load i16, i16* %arrayidx + %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1 + %1 = load i16, i16* %arrayidx3 + %add = sub nuw nsw i16 %0, %1 + ret i16 %add +} + +; CHECK-LABEL: Ldrh_4_merge +; CHECK-NOT: ldrh +; CHECK: ldp [[WORD1:w[0-9]+]], [[WORD2:w[0-9]+]], [x0] +; CHECK-DAG: and [[WORD1LO:w[0-9]+]], [[WORD1]], #0xffff +; CHECK-DAG: lsr [[WORD1HI:w[0-9]+]], [[WORD1]], #16 +; CHECK-DAG: and [[WORD2LO:w[0-9]+]], [[WORD2]], #0xffff +; CHECK-DAG: lsr [[WORD2HI:w[0-9]+]], [[WORD2]], #16 +; LE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1HI]], [[WORD1LO]] +; BE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1LO]], [[WORD1HI]] +; LE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2LO]] +; BE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2HI]] +; LE: sub w0, [[TEMP2]], [[WORD2HI]] +; BE: sub w0, [[TEMP2]], [[WORD2LO]] +define i16 @Ldrh_4_merge(i16* nocapture readonly %P) { + %arrayidx = getelementptr inbounds i16, i16* %P, i64 0 + %l0 = load i16, i16* %arrayidx + %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1 + %l1 = load i16, i16* %arrayidx2 + %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2 + %l2 = load i16, i16* %arrayidx7 + %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3 + %l3 = load i16, i16* %arrayidx12 + %add4 = sub nuw nsw i16 %l1, %l0 + %add9 = udiv i16 %add4, %l2 + %add14 = sub nuw nsw i16 %add9, %l3 + ret i16 %add14 +} + +; CHECK-LABEL: Ldrsh_merge +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub 
{{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] + +define i32 @Ldrsh_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 5 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = sext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp1, %sexttmp + ret i32 %add +} + +; CHECK-LABEL: Ldrsh_zsext_merge +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; LE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsh_zsext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 5 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = zext i16 %tmp to i32 + %sexttmp1 = sext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsh_szext_merge +; CHECK: ldr [[NEW_DEST:w[0-9]+]] +; LE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; LE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; BE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsh_szext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 5 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = zext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrb_merge +; CHECK: ldrh 
[[NEW_DEST:w[0-9]+]] +; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; CHECK-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsb_merge +; CHECK: ldrh [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; CHECK-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + %sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsb_zsext_merge +; CHECK: ldrh [[NEW_DEST:w[0-9]+]] +; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; LE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsb_zsext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldrsb_szext_merge +; CHECK: ldrh 
[[NEW_DEST:w[0-9]+]] +; LE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]] +; LE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; BE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldrsb_szext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 3 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursh_merge +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursh_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = sext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursh_zsext_merge +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; LE-DAG: lsr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursh_zsext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = zext i16 %tmp to i32 + %sexttmp1 = sext 
i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursh_szext_merge +; CHECK: ldur [[NEW_DEST:w[0-9]+]] +; LE-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff +; BE-DAG: lsr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16 +; BE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursh_szext_merge(i16* %p) nounwind { + %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1 + %tmp = load i16, i16* %add.ptr0 + %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2 + %tmp1 = load i16, i16* %add.ptr + %sexttmp = sext i16 %tmp to i32 + %sexttmp1 = zext i16 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldurb_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; CHECK-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldurb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursb_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; CHECK-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; CHECK-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursb_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + 
%sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursb_zsext_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; LE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; BE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursb_zsext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = zext i8 %tmp to i32 + %sexttmp1 = sext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Ldursb_szext_merge +; CHECK: ldurh [[NEW_DEST:w[0-9]+]] +; LE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff +; BE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8 +; BE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]] +; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]] +; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]] +define i32 @Ldursb_szext_merge(i8* %p) nounwind { + %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1 + %tmp = load i8, i8* %add.ptr0 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2 + %tmp1 = load i8, i8* %add.ptr + %sexttmp = sext i8 %tmp to i32 + %sexttmp1 = zext i8 %tmp1 to i32 + %add = sub nsw i32 %sexttmp, %sexttmp1 + ret i32 %add +} + +; CHECK-LABEL: Strh_zero +; CHECK: str wzr +define void @Strh_zero(i16* nocapture %P, i32 %n) { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %add = add nsw i32 %n, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 %idxprom1 + store i16 0, i16* %arrayidx2 + ret void +} + +; 
CHECK-LABEL: Strh_zero_4 +; CHECK: stp wzr, wzr +define void @Strh_zero_4(i16* nocapture %P, i32 %n) { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %add = add nsw i32 %n, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 %idxprom1 + store i16 0, i16* %arrayidx2 + %add3 = add nsw i32 %n, 2 + %idxprom4 = sext i32 %add3 to i64 + %arrayidx5 = getelementptr inbounds i16, i16* %P, i64 %idxprom4 + store i16 0, i16* %arrayidx5 + %add6 = add nsw i32 %n, 3 + %idxprom7 = sext i32 %add6 to i64 + %arrayidx8 = getelementptr inbounds i16, i16* %P, i64 %idxprom7 + store i16 0, i16* %arrayidx8 + ret void +} + +; CHECK-LABEL: Sturb_zero +; CHECK: sturh wzr +define void @Sturb_zero(i8* nocapture %P, i32 %n) #0 { +entry: + %sub = add nsw i32 %n, -2 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i8, i8* %P, i64 %idxprom + store i8 0, i8* %arrayidx + %sub2= add nsw i32 %n, -1 + %idxprom1 = sext i32 %sub2 to i64 + %arrayidx2 = getelementptr inbounds i8, i8* %P, i64 %idxprom1 + store i8 0, i8* %arrayidx2 + ret void +} + +; CHECK-LABEL: Sturh_zero +; CHECK: stur wzr +define void @Sturh_zero(i16* nocapture %P, i32 %n) { +entry: + %sub = add nsw i32 %n, -2 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %sub1 = add nsw i32 %n, -3 + %idxprom2 = sext i32 %sub1 to i64 + %arrayidx3 = getelementptr inbounds i16, i16* %P, i64 %idxprom2 + store i16 0, i16* %arrayidx3 + ret void +} + +; CHECK-LABEL: Sturh_zero_4 +; CHECK: stp wzr, wzr +define void @Sturh_zero_4(i16* nocapture %P, i32 %n) { +entry: + %sub = add nsw i32 %n, -3 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom + store i16 0, i16* %arrayidx + %sub1 = add nsw i32 %n, -4 + %idxprom2 = sext i32 %sub1 to i64 + %arrayidx3 = getelementptr inbounds i16, i16* 
%P, i64 %idxprom2 + store i16 0, i16* %arrayidx3 + %sub4 = add nsw i32 %n, -2 + %idxprom5 = sext i32 %sub4 to i64 + %arrayidx6 = getelementptr inbounds i16, i16* %P, i64 %idxprom5 + store i16 0, i16* %arrayidx6 + %sub7 = add nsw i32 %n, -1 + %idxprom8 = sext i32 %sub7 to i64 + %arrayidx9 = getelementptr inbounds i16, i16* %P, i64 %idxprom8 + store i16 0, i16* %arrayidx9 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll index 869966caa3ae3..985b5bf483acd 100644 --- a/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -535,6 +535,17 @@ entry: declare double @llvm.fma.f64(double, double, double) +define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) { +; CHECK-LABEL: test_vfmss_lane_f32 +; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret +entry: + %extract.rhs = extractelement <2 x float> %v, i32 1 + %extract = fsub float -0.000000e+00, %extract.rhs + %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) + ret float %0 +} + define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmss_laneq_f32 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] @@ -557,6 +568,50 @@ entry: ret double %0 } +define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) { +; CHCK-LABEL: test_vfmsd_lane_f64_0 +; CHCK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; CHCK-NEXT: ret +entry: + %tmp0 = fsub <1 x double> , %v + %tmp1 = extractelement <1 x double> %tmp0, i32 0 + %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a) + ret double %0 +} + +define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) { +; CHECK-LABEL: test_vfmss_lane_f32_0 +; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret +entry: + %tmp0 = fsub <2 x float> , %v + %tmp1 = extractelement <2 x float> %tmp0, i32 1 + %0 = tail call float 
@llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %0 +} + +define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) { +; CHECK-LABEL: test_vfmss_laneq_f32_0 +; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret +entry: + %tmp0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v + %tmp1 = extractelement <4 x float> %tmp0, i32 3 + %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %0 +} + +define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) { +; CHECK-LABEL: test_vfmsd_laneq_f64_0 +; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret +entry: + %tmp0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v + %tmp1 = extractelement <2 x double> %tmp0, i32 1 + %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a) + ret double %0 +} + define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index b74a40626ceeb..83b1cac70f5c0 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -320,21 +320,20 @@ define i32 @smovw8h(<8 x i16> %tmp1) { ret i32 %tmp5 } -define i32 @smovx16b(<16 x i8> %tmp1) { +define i64 @smovx16b(<16 x i8> %tmp1) { ; CHECK-LABEL: smovx16b: -; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[8] +; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8] %tmp3 = extractelement <16 x i8> %tmp1, i32 8 - %tmp4 = sext i8 %tmp3 to i32 - %tmp5 = add i32 %tmp4, %tmp4 - ret i32 %tmp5 + %tmp4 = sext i8 %tmp3 to i64 + ret i64 %tmp4 } -define i32 @smovx8h(<8 x i16> %tmp1) { +define i64 @smovx8h(<8 x i16> %tmp1) { ; CHECK-LABEL: smovx8h: -; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <8 x i16> %tmp1, i32 2 - %tmp4 = sext i16 %tmp3 to i32 - ret i32 %tmp4 + %tmp4 = sext i16 %tmp3 to
i64 + ret i64 %tmp4 } define i64 @smovx4s(<4 x i32> %tmp1) { diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll index b8236c5b24795..c2006ccdd064b 100644 --- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll +++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll @@ -7,7 +7,7 @@ define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { entry: ; CHECK-LABEL: jscall_patchpoint_codegen: ; CHECK: Ltmp -; CHECK: str x{{.+}}, [sp] +; CHECK: str x{{.+}}, [sp, #-16]! ; CHECK-NEXT: mov x0, x{{.+}} ; CHECK: Ltmp ; CHECK-NEXT: movz x16, #0xffff, lsl #32 @@ -16,7 +16,7 @@ entry: ; CHECK-NEXT: blr x16 ; FAST-LABEL: jscall_patchpoint_codegen: ; FAST: Ltmp -; FAST: str x{{.+}}, [sp] +; FAST: str x{{.+}}, [sp, #-16]! ; FAST: Ltmp ; FAST-NEXT: movz x16, #0xffff, lsl #32 ; FAST-NEXT: movk x16, #0xdead, lsl #16 @@ -50,7 +50,7 @@ entry: ; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2 ; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4 ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 -; FAST-NEXT: str [[REG1]], [sp] +; FAST-NEXT: str [[REG1]], [sp, #-32]! ; FAST-NEXT: str [[REG2]], [sp, #16] ; FAST-NEXT: str [[REG3]], [sp, #24] ; FAST: Ltmp @@ -90,7 +90,7 @@ entry: ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 ; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8 ; FAST-NEXT: movz [[REG5:x[0-9]+]], #0xa -; FAST-NEXT: str [[REG1]], [sp] +; FAST-NEXT: str [[REG1]], [sp, #-64]! 
; FAST-NEXT: str [[REG2]], [sp, #16] ; FAST-NEXT: str [[REG3]], [sp, #24] ; FAST-NEXT: str [[REG4]], [sp, #36] diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll index 60672aa38486b..f3af01a73559f 100644 --- a/test/CodeGen/AArch64/arm64-platform-reg.ll +++ b/test/CodeGen/AArch64/arm64-platform-reg.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 -; RUN: llc -mtriple=arm64-freebsd-gnu -aarch64-reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 +; RUN: llc -mtriple=arm64-apple-ios -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 +; RUN: llc -mtriple=arm64-freebsd-gnu -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s ; x18 is reserved as a platform register on Darwin but not on other diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll index b0b529a13f413..9ee53a0f92e6b 100644 --- a/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/test/CodeGen/AArch64/arm64-popcnt.ll @@ -4,8 +4,8 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %cnt -; CHECK: ubfx x{{[0-9]+}} -; CHECK: fmov d0, x{{[0-9]+}} +; CHECK: mov w[[IN64:[0-9]+]], w0 +; CHECK: fmov d0, x[[IN64]] ; CHECK: cnt.8b v0, v0 ; CHECK: uaddlv.8b h0, v0 ; CHECK: fmov w0, s0 @@ -59,7 +59,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %cnt ; CHECK-LABEL: cnt32: -; CHECK-NOT 16b +; CHECK-NOT: 16b ; CHECK: ret } @@ -67,7 +67,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt ; CHECK-LABEL: cnt64: -; CHECK-NOT 16b +; CHECK-NOT: 16b ; CHECK: ret } diff --git a/test/CodeGen/AArch64/arm64-rounding.ll b/test/CodeGen/AArch64/arm64-rounding.ll index 
931114447adfa..d487aabccc4f1 100644 --- a/test/CodeGen/AArch64/arm64-rounding.ll +++ b/test/CodeGen/AArch64/arm64-rounding.ll @@ -1,10 +1,8 @@ -; RUN: llc -O3 < %s -mcpu=cyclone | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" -target triple = "arm64-apple-ios6.0.0" +; RUN: llc -O3 < %s -mtriple=arm64 | FileCheck %s -; CHECK: test1 -; CHECK: frintx +; CHECK-LABEL: test1: ; CHECK: frintm +; CHECK-NOT: frintx define float @test1(float %a) #0 { entry: %call = tail call float @floorf(float %a) nounwind readnone @@ -13,9 +11,9 @@ entry: declare float @floorf(float) nounwind readnone -; CHECK: test2 -; CHECK: frintx +; CHECK-LABEL: test2: ; CHECK: frintm +; CHECK-NOT: frintx define double @test2(double %a) #0 { entry: %call = tail call double @floor(double %a) nounwind readnone @@ -24,7 +22,7 @@ entry: declare double @floor(double) nounwind readnone -; CHECK: test3 +; CHECK-LABEL: test3: ; CHECK: frinti define float @test3(float %a) #0 { entry: @@ -34,7 +32,7 @@ entry: declare float @nearbyintf(float) nounwind readnone -; CHECK: test4 +; CHECK-LABEL: test4: ; CHECK: frinti define double @test4(double %a) #0 { entry: @@ -44,9 +42,9 @@ entry: declare double @nearbyint(double) nounwind readnone -; CHECK: test5 -; CHECK: frintx +; CHECK-LABEL: test5: ; CHECK: frintp +; CHECK-NOT: frintx define float @test5(float %a) #0 { entry: %call = tail call float @ceilf(float %a) nounwind readnone @@ -55,9 +53,9 @@ entry: declare float @ceilf(float) nounwind readnone -; CHECK: test6 -; CHECK: frintx +; CHECK-LABEL: test6: ; CHECK: frintp +; CHECK-NOT: frintx define double @test6(double %a) #0 { entry: %call = tail call double @ceil(double %a) nounwind readnone @@ -66,7 +64,7 @@ entry: declare double @ceil(double) nounwind readnone -; CHECK: test7 +; CHECK-LABEL: test7: ; CHECK: frintx define float @test7(float %a) #0 { entry: @@ -76,7 +74,7 @@ entry: declare float @rintf(float) 
nounwind readnone -; CHECK: test8 +; CHECK-LABEL: test8: ; CHECK: frintx define double @test8(double %a) #0 { entry: @@ -86,9 +84,9 @@ entry: declare double @rint(double) nounwind readnone -; CHECK: test9 -; CHECK: frintx +; CHECK-LABEL: test9: ; CHECK: frintz +; CHECK-NOT: frintx define float @test9(float %a) #0 { entry: %call = tail call float @truncf(float %a) nounwind readnone @@ -97,9 +95,9 @@ entry: declare float @truncf(float) nounwind readnone -; CHECK: test10 -; CHECK: frintx +; CHECK-LABEL: test10: ; CHECK: frintz +; CHECK-NOT: frintx define double @test10(double %a) #0 { entry: %call = tail call double @trunc(double %a) nounwind readnone @@ -108,9 +106,9 @@ entry: declare double @trunc(double) nounwind readnone -; CHECK: test11 -; CHECK: frintx +; CHECK-LABEL: test11: ; CHECK: frinta +; CHECK-NOT: frintx define float @test11(float %a) #0 { entry: %call = tail call float @roundf(float %a) nounwind readnone @@ -119,9 +117,9 @@ entry: declare float @roundf(float %a) nounwind readnone -; CHECK: test12 -; CHECK: frintx +; CHECK-LABEL: test12: ; CHECK: frinta +; CHECK-NOT: frintx define double @test12(double %a) #0 { entry: %call = tail call double @round(double %a) nounwind readnone @@ -130,7 +128,7 @@ entry: declare double @round(double %a) nounwind readnone -; CHECK: test13 +; CHECK-LABEL: test13: ; CHECK-NOT: frintx ; CHECK: frintm define float @test13(float %a) #1 { @@ -139,7 +137,7 @@ entry: ret float %call } -; CHECK: test14 +; CHECK-LABEL: test14: ; CHECK-NOT: frintx ; CHECK: frintm define double @test14(double %a) #1 { @@ -148,7 +146,7 @@ entry: ret double %call } -; CHECK: test15 +; CHECK-LABEL: test15: ; CHECK-NOT: frintx ; CHECK: frintp define float @test15(float %a) #1 { @@ -157,7 +155,7 @@ entry: ret float %call } -; CHECK: test16 +; CHECK-LABEL: test16: ; CHECK-NOT: frintx ; CHECK: frintp define double @test16(double %a) #1 { @@ -166,7 +164,7 @@ entry: ret double %call } -; CHECK: test17 +; CHECK-LABEL: test17: ; CHECK-NOT: frintx ; CHECK: 
frintz define float @test17(float %a) #1 { @@ -175,7 +173,7 @@ entry: ret float %call } -; CHECK: test18 +; CHECK-LABEL: test18: ; CHECK-NOT: frintx ; CHECK: frintz define double @test18(double %a) #1 { @@ -184,7 +182,7 @@ entry: ret double %call } -; CHECK: test19 +; CHECK-LABEL: test19: ; CHECK-NOT: frintx ; CHECK: frinta define float @test19(float %a) #1 { @@ -193,7 +191,7 @@ entry: ret float %call } -; CHECK: test20 +; CHECK-LABEL: test20: ; CHECK-NOT: frintx ; CHECK: frinta define double @test20(double %a) #1 { @@ -202,7 +200,5 @@ entry: ret double %call } - - attributes #0 = { nounwind } attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index 599712be401c6..2ecd66ddf5d42 100644 --- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -1,5 +1,5 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE -; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE +; RUN: llc %s -o - -enable-shrink-wrap=true -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE +; RUN: llc %s -o - -enable-shrink-wrap=false -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios" @@ -539,3 +539,94 @@ if.end: declare void @abort() #0 attributes #0 = { noreturn nounwind } + +; Make sure that we handle infinite loops properly. When checking that the Save +; and Restore blocks are control flow equivalent, the loop searches for the +; immediate (post) dominator for the (restore) save blocks. When either the Save +; or Restore block is located in an infinite loop the only immediate (post) +; dominator is itself.
In this case, we cannot perform shrink wrapping, but we +; should return gracefully and continue compilation. +; The only condition for this test is the compilation finishes correctly. +; +; CHECK-LABEL: infiniteloop +; CHECK: ret +define void @infiniteloop() { +entry: + br i1 undef, label %if.then, label %if.end + +if.then: + %ptr = alloca i32, i32 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %sum.03 = phi i32 [ 0, %if.then ], [ %add, %for.body ] + %call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)() + %add = add nsw i32 %call, %sum.03 + store i32 %add, i32* %ptr + br label %for.body + +if.end: + ret void +} + +; Another infinite loop test this time with a body bigger than just one block. +; CHECK-LABEL: infiniteloop2 +; CHECK: ret +define void @infiniteloop2() { +entry: + br i1 undef, label %if.then, label %if.end + +if.then: + %ptr = alloca i32, i32 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %sum.03 = phi i32 [ 0, %if.then ], [ %add, %body1 ], [ 1, %body2] + %call = tail call i32 asm "mov $0, #0", "=r,~{x19}"() + %add = add nsw i32 %call, %sum.03 + store i32 %add, i32* %ptr + br i1 undef, label %body1, label %body2 + +body1: + tail call void asm sideeffect "nop", "~{x19}"() + br label %for.body + +body2: + tail call void asm sideeffect "nop", "~{x19}"() + br label %for.body + +if.end: + ret void +} + +; Another infinite loop test this time with two nested infinite loop. 
+; CHECK-LABEL: infiniteloop3 +; CHECK: ret +define void @infiniteloop3() { +entry: + br i1 undef, label %loop2a, label %body + +body: ; preds = %entry + br i1 undef, label %loop2a, label %end + +loop1: ; preds = %loop2a, %loop2b + %var.phi = phi i32* [ %next.phi, %loop2b ], [ %var, %loop2a ] + %next.phi = phi i32* [ %next.load, %loop2b ], [ %next.var, %loop2a ] + %0 = icmp eq i32* %var, null + %next.load = load i32*, i32** undef + br i1 %0, label %loop2a, label %loop2b + +loop2a: ; preds = %loop1, %body, %entry + %var = phi i32* [ null, %body ], [ null, %entry ], [ %next.phi, %loop1 ] + %next.var = phi i32* [ undef, %body ], [ null, %entry ], [ %next.load, %loop1 ] + br label %loop1 + +loop2b: ; preds = %loop1 + %gep1 = bitcast i32* %var.phi to i32* + %next.ptr = bitcast i32* %gep1 to i32** + store i32* %next.phi, i32** %next.ptr + br label %loop1 + +end: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-spill-lr.ll b/test/CodeGen/AArch64/arm64-spill-lr.ll index 88109088a2ff4..2ea5d7810a146 100644 --- a/test/CodeGen/AArch64/arm64-spill-lr.ll +++ b/test/CodeGen/AArch64/arm64-spill-lr.ll @@ -1,9 +1,9 @@ ; RUN: llc -mtriple=arm64-apple-ios < %s @bar = common global i32 0, align 4 -; Leaf function which uses all callee-saved registers and allocates >= 256 bytes on the stack -; this will cause processFunctionBeforeCalleeSavedScan() to spill LR as an additional scratch -; register. +; Leaf function which uses all callee-saved registers and allocates >= 256 bytes +; on the stack this will cause determineCalleeSaves() to spill LR as an +; additional scratch register. ; ; This is a crash-only regression test for rdar://15124582. 
define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) nounwind { diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll index 1a4df7a6f2d68..3eb1d27530012 100644 --- a/test/CodeGen/AArch64/arm64-stackmap.ll +++ b/test/CodeGen/AArch64/arm64-stackmap.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-darwin < %s | FileCheck %s -; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort=1 < %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-darwin -mattr=+reserve-x18 < %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-darwin -mattr=+reserve-x18 -fast-isel -fast-isel-abort=1 < %s | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. diff --git a/test/CodeGen/AArch64/arm64-stp.ll b/test/CodeGen/AArch64/arm64-stp.ll index 72561aac6e871..98242d0bb57e5 100644 --- a/test/CodeGen/AArch64/arm64-stp.ll +++ b/test/CodeGen/AArch64/arm64-stp.ll @@ -1,8 +1,6 @@ ; RUN: llc < %s -march=arm64 -aarch64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s -; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\ -; RUN: -verify-machineinstrs -mcpu=cyclone | FileCheck -check-prefix=STUR_CHK %s -; CHECK: stp_int +; CHECK-LABEL: stp_int ; CHECK: stp w0, w1, [x2] define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { store i32 %a, i32* %p, align 4 @@ -11,7 +9,7 @@ define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { ret void } -; CHECK: stp_long +; CHECK-LABEL: stp_long ; CHECK: stp x0, x1, [x2] define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { store i64 %a, i64* %p, align 8 @@ -20,7 +18,7 @@ define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { ret void } -; CHECK: stp_float +; CHECK-LABEL: stp_float ; CHECK: stp s0, s1, [x0] define void @stp_float(float %a, float %b, float* nocapture %p) nounwind { store float %a, float* %p, align 4 @@ -29,7 +27,7 @@ define void @stp_float(float %a, float %b, float* nocapture %p) 
nounwind { ret void } -; CHECK: stp_double +; CHECK-LABEL: stp_double ; CHECK: stp d0, d1, [x0] define void @stp_double(double %a, double %b, double* nocapture %p) nounwind { store double %a, double* %p, align 8 @@ -40,9 +38,9 @@ define void @stp_double(double %a, double %b, double* nocapture %p) nounwind { ; Test the load/store optimizer---combine ldurs into a ldp, if appropriate define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { -; STUR_CHK: stur_int -; STUR_CHK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_int +; CHECK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %p, i32 -1 store i32 %a, i32* %p1, align 2 %p2 = getelementptr inbounds i32, i32* %p, i32 -2 @@ -51,9 +49,9 @@ define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind { } define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { -; STUR_CHK: stur_long -; STUR_CHK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_long +; CHECK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16] +; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %p, i32 -1 store i64 %a, i64* %p1, align 2 %p2 = getelementptr inbounds i64, i64* %p, i32 -2 @@ -62,9 +60,9 @@ define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind { } define void @stur_float(float %a, float %b, float* nocapture %p) nounwind { -; STUR_CHK: stur_float -; STUR_CHK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_float +; CHECK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8] +; CHECK-NEXT: ret %p1 = getelementptr inbounds float, float* %p, i32 -1 store float %a, float* %p1, align 2 %p2 = getelementptr inbounds float, float* %p, i32 -2 @@ -73,9 +71,9 @@ define void @stur_float(float %a, float %b, float* nocapture %p) nounwind { } define void @stur_double(double %a, double %b, double* nocapture %p) nounwind { -; 
STUR_CHK: stur_double -; STUR_CHK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16] -; STUR_CHK-NEXT: ret +; CHECK-LABEL: stur_double +; CHECK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16] +; CHECK-NEXT: ret %p1 = getelementptr inbounds double, double* %p, i32 -1 store double %a, double* %p1, align 2 %p2 = getelementptr inbounds double, double* %p, i32 -2 diff --git a/test/CodeGen/AArch64/arm64-strict-align.ll b/test/CodeGen/AArch64/arm64-strict-align.ll index 109f4115d8017..28c158f7a2eb0 100644 --- a/test/CodeGen/AArch64/arm64-strict-align.ll +++ b/test/CodeGen/AArch64/arm64-strict-align.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s -; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-no-strict-align | FileCheck %s -; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT -; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align -fast-isel | FileCheck %s --check-prefix=CHECK-STRICT +; RUN: llc < %s -mtriple=arm64-apple-darwin -mattr=+strict-align | FileCheck %s --check-prefix=CHECK-STRICT +; RUN: llc < %s -mtriple=arm64-apple-darwin -mattr=+strict-align -fast-isel | FileCheck %s --check-prefix=CHECK-STRICT define i32 @f0(i32* nocapture %p) nounwind { ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2] diff --git a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll index f94f88a1183fe..c95eca062ff6a 100644 --- a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll +++ b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll @@ -1,4 +1,7 @@ -; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \ +; RUN: -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=NOEMU %s +; RUN: llc -emulated-tls -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \ +; RUN: -verify-machineinstrs < %s | 
FileCheck -check-prefix=CHECK -check-prefix=EMU %s ; If the .tlsdesccall and blr parts are emitted completely separately (even with ; glue) then LLVM will separate them quite happily (with a spill at O0, hence @@ -13,6 +16,40 @@ define i32 @test_generaldynamic() { %val = load i32, i32* @general_dynamic_var ret i32 %val -; CHECK: .tlsdesccall general_dynamic_var -; CHECK-NEXT: blr {{x[0-9]+}} +; NOEMU: .tlsdesccall general_dynamic_var +; NOEMU-NEXT: blr {{x[0-9]+}} +; NOEMU-NOT: __emutls_v.general_dynamic_var: + +; EMU: adrp{{.+}}__emutls_v.general_dynamic_var +; EMU: bl __emutls_get_address + +; EMU-NOT: __emutls_v.general_dynamic_var +; EMU-NOT: __emutls_t.general_dynamic_var +} + +@emulated_init_var = thread_local global i32 37, align 8 + +define i32 @test_emulated_init() { +; COMMON-LABEL: test_emulated_init: + + %val = load i32, i32* @emulated_init_var + ret i32 %val + +; EMU: adrp{{.+}}__emutls_v.emulated_init_var +; EMU: bl __emutls_get_address + +; EMU-NOT: __emutls_v.general_dynamic_var: + +; EMU: .align 3 +; EMU-LABEL: __emutls_v.emulated_init_var: +; EMU-NEXT: .xword 4 +; EMU-NEXT: .xword 8 +; EMU-NEXT: .xword 0 +; EMU-NEXT: .xword __emutls_t.emulated_init_var + +; EMU-LABEL: __emutls_t.emulated_init_var: +; EMU-NEXT: .word 37 } + +; CHECK-NOT: __emutls_v.general_dynamic_var: +; EMU-NOT: __emutls_t.general_dynamic_var diff --git a/test/CodeGen/AArch64/arm64-trunc-store.ll b/test/CodeGen/AArch64/arm64-trunc-store.ll index 7cde629b33ae4..be0388284fb85 100644 --- a/test/CodeGen/AArch64/arm64-trunc-store.ll +++ b/test/CodeGen/AArch64/arm64-trunc-store.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s define void @bar(<8 x i16> %arg, <8 x i8>* %p) nounwind { ; CHECK-LABEL: bar: diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll index a52c4ebf13e7e..c1800085884c9 100644 --- a/test/CodeGen/AArch64/arm64-vabs.ll +++ 
b/test/CodeGen/AArch64/arm64-vabs.ll @@ -134,6 +134,72 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ret <2 x i64> %tmp4 } +define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) { +; CHECK-LABEL: uabdl8h_log2_shuffle +; CHECK: uabdl2.8h +; CHECK: uabdl.8h + %aload = load <16 x i8>, <16 x i8>* %a, align 1 + %bload = load <16 x i8>, <16 x i8>* %b, align 1 + %aext = zext <16 x i8> %aload to <16 x i16> + %bext = zext <16 x i8> %bload to <16 x i16> + %abdiff = sub nsw <16 x i16> %aext, %bext + %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer + %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff + %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff + %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> + %bin1.rdx = add <16 x i16> %absel, %rdx.shuf + %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> + %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx + %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> + %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136 + %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> + %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0 + ret i16 %reduced_v +} + +define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) { +; CHECK-LABEL: uabdl4s_log2_shuffle +; CHECK: uabdl2.4s +; CHECK: uabdl.4s + %aload = load <8 x i16>, <8 x i16>* %a, align 1 + %bload = load <8 x i16>, <8 x i16>* %b, align 1 + %aext = zext <8 x i16> %aload to <8 x i32> + %bext = zext <8 x i16> %bload to <8 x i32> + %abdiff = sub nsw <8 x i32> %aext, %bext + %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer + %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff + %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff + %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %absel, %rdx.shuf 
+ %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136 + %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> + %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0 + ret i32 %reduced_v +} + +define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { +; CHECK: uabdl2d_log2_shuffle +; CHECK: uabdl2.2d +; CHECK: uabdl.2d + %aload = load <4 x i32>, <4 x i32>* %a, align 1 + %bload = load <4 x i32>, <4 x i32>* %b, align 1 + %aext = zext <4 x i32> %aload to <4 x i64> + %bext = zext <4 x i32> %bload to <4 x i64> + %abdiff = sub nsw <4 x i64> %aext, %bext + %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer + %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff + %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff + %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> + %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136 + %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> + %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0 + ret i64 %reduced_v +} + define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fabd_2s: ;CHECK: fabd.2s diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll index 44f2af1c5e79a..8702b41023d0c 100644 --- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false -disable-post-ra < %s | FileCheck %s %va_list = type {i8*, i8*, i8*, i32, i32} diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll 
b/test/CodeGen/AArch64/arm64-vector-ext.ll index 5bee1611e6c64..994a9956cf7f8 100644 --- a/test/CodeGen/AArch64/arm64-vector-ext.ll +++ b/test/CodeGen/AArch64/arm64-vector-ext.ll @@ -1,27 +1,27 @@ -; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s - -;CHECK: @func30 -;CHECK: ushll.4s v0, v0, #0 -;CHECK: movi.4s v1, #0x1 -;CHECK: and.16b v0, v0, v1 -;CHECK: str q0, [x0] -;CHECK: ret - -%T0_30 = type <4 x i1> -%T1_30 = type <4 x i32> -define void @func30(%T0_30 %v0, %T1_30* %p1) { - %r = zext %T0_30 %v0 to %T1_30 - store %T1_30 %r, %T1_30* %p1 - ret void -} - -; Extend from v1i1 was crashing things (PR20791). Make sure we do something -; sensible instead. -define <1 x i32> @autogen_SD7918() { -; CHECK-LABEL: autogen_SD7918 -; CHECK: movi d0, #0000000000000000 -; CHECK-NEXT: ret - %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0 - %ZE = zext <1 x i1> %I29 to <1 x i32> - ret <1 x i32> %ZE -} +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +;CHECK: @func30 +;CHECK: movi.4h v1, #0x1 +;CHECK: and.8b v0, v0, v1 +;CHECK: ushll.4s v0, v0, #0 +;CHECK: str q0, [x0] +;CHECK: ret + +%T0_30 = type <4 x i1> +%T1_30 = type <4 x i32> +define void @func30(%T0_30 %v0, %T1_30* %p1) { + %r = zext %T0_30 %v0 to %T1_30 + store %T1_30 %r, %T1_30* %p1 + ret void +} + +; Extend from v1i1 was crashing things (PR20791). Make sure we do something +; sensible instead. 
+define <1 x i32> @autogen_SD7918() { +; CHECK-LABEL: autogen_SD7918 +; CHECK: movi d0, #0000000000000000 +; CHECK-NEXT: ret + %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0 + %ZE = zext <1 x i1> %I29 to <1 x i32> + ret <1 x i32> %ZE +} diff --git a/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/test/CodeGen/AArch64/arm64-vminmaxnm.ll index b5aca45cd479a..302ba9d681c64 100644 --- a/test/CodeGen/AArch64/arm64-vminmaxnm.ll +++ b/test/CodeGen/AArch64/arm64-vminmaxnm.ll @@ -42,13 +42,28 @@ define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp ret <2 x double> %vminnm2.i } +define float @f7(float %a, float %b) nounwind readnone ssp { +; CHECK: fmaxnm s0, s0, s1 +; CHECK: ret + %vmaxnm2.i = tail call float @llvm.aarch64.neon.fmaxnm.f32(float %a, float %b) nounwind + ret float %vmaxnm2.i +} + +define double @f8(double %a, double %b) nounwind readnone ssp { +; CHECK: fminnm d0, d0, d1 +; CHECK: ret + %vmaxnm2.i = tail call double @llvm.aarch64.neon.fminnm.f64(double %a, double %b) nounwind + ret double %vmaxnm2.i +} + declare <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone declare <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone declare <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone declare <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone - +declare float @llvm.aarch64.neon.fmaxnm.f32(float, float) nounwind readnone +declare double @llvm.aarch64.neon.fminnm.f64(double, double) nounwind readnone define double @test_fmaxnmv(<2 x double> %in) { ; CHECK-LABEL: test_fmaxnmv: diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll index 
ce9c0a64b5872..ec49110d40526 100644 --- a/test/CodeGen/AArch64/arm64-xaluo.ll +++ b/test/CodeGen/AArch64/arm64-xaluo.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -disable-post-ra -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort=1 -disable-post-ra -verify-machineinstrs < %s | FileCheck %s ; ; Get the actual value of the overflow bit. diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll index cb90caeadc1f3..900d2072925f5 100644 --- a/test/CodeGen/AArch64/atomic-ops.ll +++ b/test/CodeGen/AArch64/atomic-ops.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created @@ -893,6 +893,8 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] @@ -916,6 +918,8 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], 
[[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] @@ -927,21 +931,21 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic %old = extractvalue { i32, i1 } %pair, 0 +; CHECK: mov {{[xw]}}[[WANTED:[0-9]+]], {{[xw]}}0 + ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 ; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]: ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] - ; w0 below is a reasonable guess but could change: it certainly comes into the - ; function there. -; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: cmp w[[OLD]], w[[WANTED]] ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb - -; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]] ret i32 %old } @@ -963,6 +967,8 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; As above, w1 is a reasonable guess. ; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] +; CHECK: [[GET_OUT]]: +; CHECK: clrex ; CHECK-NOT: dmb ; CHECK: str x[[OLD]], diff --git a/test/CodeGen/AArch64/bitcast-v2i8.ll b/test/CodeGen/AArch64/bitcast-v2i8.ll index 4bdac641c5bca..aff3ffc70a711 100644 --- a/test/CodeGen/AArch64/bitcast-v2i8.ll +++ b/test/CodeGen/AArch64/bitcast-v2i8.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck %s ; Part of PR21549: going through the stack isn't ideal but is correct. 
diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 9b731fa72a470..509b547a5c82a 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -196,3 +196,44 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) { ret void } + +; Bitfield insert where the second or operand is a better match to be folded into the BFM +define void @test_32bit_opnd1_better(i32* %existing, i32* %new) { +; CHECK-LABEL: test_32bit_opnd1_better: + + %oldval = load volatile i32, i32* %existing + %oldval_keep = and i32 %oldval, 65535 ; 0x0000ffff + + %newval = load i32, i32* %new + %newval_shifted = shl i32 %newval, 16 + %newval_masked = and i32 %newval_shifted, 16711680 ; 0x00ff0000 + + %combined = or i32 %oldval_keep, %newval_masked + store volatile i32 %combined, i32* %existing +; CHECK: and [[BIT:w[0-9]+]], {{w[0-9]+}}, #0xffff +; CHECK: bfi [[BIT]], {{w[0-9]+}}, #16, #8 + + ret void +} + +; Tests when all the bits from one operand are not useful +define i32 @test_nouseful_bits(i8 %a, i32 %b) { +; CHECK-LABEL: test_nouseful_bits: +; CHECK: bfi +; CHECK: bfi +; CHECK: bfi +; CHECK-NOT: bfi +; CHECK-NOT: or +; CHECK: lsl + %conv = zext i8 %a to i32 ; 0 0 0 A + %shl = shl i32 %b, 8 ; B2 B1 B0 0 + %or = or i32 %conv, %shl ; B2 B1 B0 A + %shl.1 = shl i32 %or, 8 ; B1 B0 A 0 + %or.1 = or i32 %conv, %shl.1 ; B1 B0 A A + %shl.2 = shl i32 %or.1, 8 ; B0 A A 0 + %or.2 = or i32 %conv, %shl.2 ; B0 A A A + %shl.3 = shl i32 %or.2, 8 ; A A A 0 + %or.3 = or i32 %conv, %shl.3 ; A A A A + %shl.4 = shl i32 %or.3, 8 ; A A A 0 + ret i32 %shl.4 +} diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll index 78399c80b5de2..5f19b6943b8e2 100644 --- a/test/CodeGen/AArch64/bitfield.ll +++ b/test/CodeGen/AArch64/bitfield.ll @@ -3,51 +3,67 @@ @var32 = global i32 0 @var64 = global i64 0 -define void @test_extendb(i8 %var) { -; CHECK-LABEL: test_extendb: +define void @test_extendb32(i8 %var) { +; 
CHECK-LABEL: test_extendb32: %sxt32 = sext i8 %var to i32 store volatile i32 %sxt32, i32* @var32 ; CHECK: sxtb {{w[0-9]+}}, {{w[0-9]+}} - %sxt64 = sext i8 %var to i64 - store volatile i64 %sxt64, i64* @var64 -; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}} - ; N.b. this doesn't actually produce a bitfield instruction at the ; moment, but it's still a good test to have and the semantics are ; correct. %uxt32 = zext i8 %var to i32 store volatile i32 %uxt32, i32* @var32 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff + ret void +} + +define void @test_extendb64(i8 %var) { +; CHECK-LABEL: test_extendb64: + + %sxt64 = sext i8 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}} +; N.b. this doesn't actually produce a bitfield instruction at the +; moment, but it's still a good test to have and the semantics are +; correct. %uxt64 = zext i8 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff ret void } -define void @test_extendh(i16 %var) { -; CHECK-LABEL: test_extendh: +define void @test_extendh32(i16 %var) { +; CHECK-LABEL: test_extendh32: %sxt32 = sext i16 %var to i32 store volatile i32 %sxt32, i32* @var32 ; CHECK: sxth {{w[0-9]+}}, {{w[0-9]+}} - %sxt64 = sext i16 %var to i64 - store volatile i64 %sxt64, i64* @var64 -; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} - ; N.b. this doesn't actually produce a bitfield instruction at the ; moment, but it's still a good test to have and the semantics are ; correct. %uxt32 = zext i16 %var to i32 store volatile i32 %uxt32, i32* @var32 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff + ret void +} + +define void @test_extendh64(i16 %var) { +; CHECK-LABEL: test_extendh64: + + %sxt64 = sext i16 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} +; N.b. 
this doesn't actually produce a bitfield instruction at the +; moment, but it's still a good test to have and the semantics are +; correct. %uxt64 = zext i16 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff ret void } @@ -60,7 +76,7 @@ define void @test_extendw(i32 %var) { %uxt64 = zext i32 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32 +; CHECK: mov {{w[0-9]+}}, w0 ret void } diff --git a/test/CodeGen/AArch64/bitreverse.ll b/test/CodeGen/AArch64/bitreverse.ll new file mode 100644 index 0000000000000..936e3554b397f --- /dev/null +++ b/test/CodeGen/AArch64/bitreverse.ll @@ -0,0 +1,87 @@ +; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s + +; These tests just check that the plumbing is in place for @llvm.bitreverse. The +; actual output is massive at the moment as llvm.bitreverse is not yet legal. + +declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone + +define <2 x i16> @f(<2 x i16> %a) { +; CHECK-LABEL: f: +; CHECK: ushr + %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) + ret <2 x i16> %b +} + +declare i8 @llvm.bitreverse.i8(i8) readnone + +; Unfortunately some of the shift-and-inserts become BFIs, and some do not :( +define i8 @g(i8 %a) { +; CHECK-LABEL: g: +; CHECK-DAG: lsr [[S5:w.*]], w0, #5 +; CHECK-DAG: lsr [[S4:w.*]], w0, #4 +; CHECK-DAG: lsr [[S3:w.*]], w0, #3 +; CHECK-DAG: lsr [[S2:w.*]], w0, #2 +; CHECK-DAG: lsl [[L1:w.*]], w0, #29 +; CHECK-DAG: lsl [[L2:w.*]], w0, #19 +; CHECK-DAG: lsl [[L3:w.*]], w0, #17 + +; CHECK-DAG: and [[T1:w.*]], [[L1]], #0x40000000 +; CHECK-DAG: bfi [[T1]], w0, #31, #1 +; CHECK-DAG: bfi [[T1]], [[S2]], #29, #1 +; CHECK-DAG: bfi [[T1]], [[S3]], #28, #1 +; CHECK-DAG: bfi [[T1]], [[S4]], #27, #1 +; CHECK-DAG: bfi [[T1]], [[S5]], #26, #1 +; CHECK-DAG: and [[T2:w.*]], [[L2]], #0x2000000 +; CHECK-DAG: and [[T3:w.*]], [[L3]], #0x1000000 +; CHECK-DAG: orr [[T4:w.*]], 
[[T1]], [[T2]] +; CHECK-DAG: orr [[T5:w.*]], [[T4]], [[T3]] +; CHECK: lsr w0, [[T5]], #24 + + %b = call i8 @llvm.bitreverse.i8(i8 %a) + ret i8 %b +} + +declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>) readnone + +define <8 x i8> @g_vec(<8 x i8> %a) { +; Try and match as much of the sequence as precisely as possible. + +; CHECK-LABEL: g_vec: +; CHECK-DAG: movi [[M1:v.*]], #0x80 +; CHECK-DAG: movi [[M2:v.*]], #0x40 +; CHECK-DAG: movi [[M3:v.*]], #0x20 +; CHECK-DAG: movi [[M4:v.*]], #0x10 +; CHECK-DAG: movi [[M5:v.*]], #0x8 +; CHECK-DAG: movi [[M6:v.*]], #0x4{{$}} +; CHECK-DAG: movi [[M7:v.*]], #0x2{{$}} +; CHECK-DAG: movi [[M8:v.*]], #0x1{{$}} +; CHECK-DAG: shl [[S1:v.*]], v0.8b, #7 +; CHECK-DAG: shl [[S2:v.*]], v0.8b, #5 +; CHECK-DAG: shl [[S3:v.*]], v0.8b, #3 +; CHECK-DAG: shl [[S4:v.*]], v0.8b, #1 +; CHECK-DAG: ushr [[S5:v.*]], v0.8b, #1 +; CHECK-DAG: ushr [[S6:v.*]], v0.8b, #3 +; CHECK-DAG: ushr [[S7:v.*]], v0.8b, #5 +; CHECK-DAG: ushr [[S8:v.*]], v0.8b, #7 +; CHECK-DAG: and [[A1:v.*]], [[S1]], [[M1]] +; CHECK-DAG: and [[A2:v.*]], [[S2]], [[M2]] +; CHECK-DAG: and [[A3:v.*]], [[S3]], [[M3]] +; CHECK-DAG: and [[A4:v.*]], [[S4]], [[M4]] +; CHECK-DAG: and [[A5:v.*]], [[S5]], [[M5]] +; CHECK-DAG: and [[A6:v.*]], [[S6]], [[M6]] +; CHECK-DAG: and [[A7:v.*]], [[S7]], [[M7]] +; CHECK-DAG: and [[A8:v.*]], [[S8]], [[M8]] + +; The rest can be ORRed together in any order; it's not worth the test +; maintenance to match them precisely. 
+; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK-DAG: orr +; CHECK: ret + %b = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a) + ret <8 x i8> %b +} diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll index c78fabac61874..004267f4e4e04 100644 --- a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -403,6 +403,32 @@ return: ; preds = %land.lhs.true, %con ret i32 %retval.0 } +define void @cmp_shifted(i32 %in, i32 %lhs, i32 %rhs) { +; CHECK-LABEL: cmp_shifted: +; CHECK: cmp w0, #1 +; [...] +; CHECK: cmp w0, #2, lsl #12 + + %tst_low = icmp sgt i32 %in, 0 + br i1 %tst_low, label %true, label %false + +true: + call i32 @zoo(i32 128) + ret void + +false: + %tst = icmp sgt i32 %in, 8191 + br i1 %tst, label %truer, label %falser + +truer: + call i32 @zoo(i32 42) + ret void + +falser: + call i32 @zoo(i32 1) + ret void +} + declare i32 @zoo(i32) declare double @yoo(i32) diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll index 1266842fcc6d1..a8399f92ebe4e 100644 --- a/test/CodeGen/AArch64/cpus.ll +++ b/test/CodeGen/AArch64/cpus.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a35 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll new file mode 100644 index 0000000000000..a9ae00c8d270b --- /dev/null +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-ios 
-enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s +; Shrink wrapping currently does not kick in because we have a TLS CALL +; in the entry block and it will clobber the link register. + +%struct.S = type { i8 } + +@sg = internal thread_local global %struct.S zeroinitializer, align 1 +@__dso_handle = external global i8 +@__tls_guard = internal thread_local unnamed_addr global i1 false + +declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) +declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) +declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) + +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind { + %.b.i = load i1, i1* @__tls_guard, align 1 + br i1 %.b.i, label %__tls_init.exit, label %init.i + +init.i: + store i1 true, i1* @__tls_guard, align 1 + %call.i.i = tail call %struct.S* @_ZN1SC1Ev(%struct.S* nonnull @sg) + %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (%struct.S* (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle) + br label %__tls_init.exit + +__tls_init.exit: + ret %struct.S* @sg +} + +; CHECK-LABEL: _ZTW2sg +; CHECK-NOT: stp d31, d30 +; CHECK-NOT: stp d29, d28 +; CHECK-NOT: stp d27, d26 +; CHECK-NOT: stp d25, d24 +; CHECK-NOT: stp d23, d22 +; CHECK-NOT: stp d21, d20 +; CHECK-NOT: stp d19, d18 +; CHECK-NOT: stp d17, d16 +; CHECK-NOT: stp d7, d6 +; CHECK-NOT: stp d5, d4 +; CHECK-NOT: stp d3, d2 +; CHECK-NOT: stp d1, d0 +; CHECK-NOT: stp x20, x19 +; CHECK-NOT: stp x14, x13 +; CHECK-NOT: stp x12, x11 +; CHECK-NOT: stp x10, x9 +; CHECK-NOT: stp x8, x7 +; CHECK-NOT: stp x6, x5 +; CHECK-NOT: stp x4, x3 +; CHECK-NOT: stp x2, x1 +; CHECK: blr +; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]] +; CHECK: blr +; CHECK: tlv_atexit +; CHECK: [[BB_end]]: +; CHECK: blr +; CHECK-NOT: ldp x2, x1 +; CHECK-NOT: ldp x4, x3 +; CHECK-NOT: ldp x6, x5 +; CHECK-NOT: ldp x8, x7 +; CHECK-NOT: ldp x10, x9 +; CHECK-NOT: ldp x12, x11 +; CHECK-NOT: ldp x14, 
x13 +; CHECK-NOT: ldp x20, x19 +; CHECK-NOT: ldp d1, d0 +; CHECK-NOT: ldp d3, d2 +; CHECK-NOT: ldp d5, d4 +; CHECK-NOT: ldp d7, d6 +; CHECK-NOT: ldp d17, d16 +; CHECK-NOT: ldp d19, d18 +; CHECK-NOT: ldp d21, d20 +; CHECK-NOT: ldp d23, d22 +; CHECK-NOT: ldp d25, d24 +; CHECK-NOT: ldp d27, d26 +; CHECK-NOT: ldp d29, d28 +; CHECK-NOT: ldp d31, d30 diff --git a/test/CodeGen/AArch64/dag-combine-select.ll b/test/CodeGen/AArch64/dag-combine-select.ll new file mode 100644 index 0000000000000..45b998d9136d0 --- /dev/null +++ b/test/CodeGen/AArch64/dag-combine-select.ll @@ -0,0 +1,47 @@ +; RUN: llc -disable-post-ra -o - %s | FileCheck %s +target triple = "arm64--" + +@out = internal global i32 0, align 4 + +; Ensure that we transform select(C0, x, select(C1, x, y)) towards +; select(C0 | C1, x, y) so we can use CMP;CCMP for the implementation. +; CHECK-LABEL: test0: +; CHECK: cmp w0, #7 +; CHECK: ccmp w1, #0, #0, ne +; CHECK: csel w0, w1, w2, gt +; CHECK: ret +define i32 @test0(i32 %v0, i32 %v1, i32 %v2) { + %cmp1 = icmp eq i32 %v0, 7 + %cmp2 = icmp sgt i32 %v1, 0 + %sel0 = select i1 %cmp1, i32 %v1, i32 %v2 + %sel1 = select i1 %cmp2, i32 %v1, i32 %sel0 + ret i32 %sel1 +} + +; Usually we keep select(C0 | C1, x, y) as is on aarch64 to create CMP;CCMP +; sequences. This case should be transformed to select(C0, select(C1, x, y), y) +; anyway to get CSE effects. 
+; CHECK-LABEL: test1: +; CHECK-NOT: ccmp +; CHECK: cmp w0, #7 +; CHECK: adrp x[[OUTNUM:[0-9]+]], out +; CHECK: csel w[[SEL0NUM:[0-9]+]], w1, w2, eq +; CHECK: cmp w[[SEL0NUM]], #13 +; CHECK: csel w[[SEL1NUM:[0-9]+]], w1, w2, lo +; CHECK: cmp w0, #42 +; CHECK: csel w[[SEL2NUM:[0-9]+]], w1, w[[SEL1NUM]], eq +; CHECK: str w[[SEL1NUM]], [x[[OUTNUM]], :lo12:out] +; CHECK: str w[[SEL2NUM]], [x[[OUTNUM]], :lo12:out] +; CHECK: ret +define void @test1(i32 %bitset, i32 %val0, i32 %val1) { + %cmp1 = icmp eq i32 %bitset, 7 + %cond = select i1 %cmp1, i32 %val0, i32 %val1 + %cmp5 = icmp ult i32 %cond, 13 + %cond11 = select i1 %cmp5, i32 %val0, i32 %val1 + %cmp3 = icmp eq i32 %bitset, 42 + %or.cond = or i1 %cmp3, %cmp5 + %cond17 = select i1 %or.cond, i32 %val0, i32 %val1 + store volatile i32 %cond11, i32* @out, align 4 + store volatile i32 %cond17, i32* @out, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/divrem.ll b/test/CodeGen/AArch64/divrem.ll new file mode 100644 index 0000000000000..9f648eb63eac1 --- /dev/null +++ b/test/CodeGen/AArch64/divrem.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s + +; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and +; should not generate select error. 
+define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { +; CHECK-LABEL: test_udivrem +; CHECK-DAG: udivrem +; CHECK-NOT: LLVM ERROR: Cannot select + %div = udiv <2 x i32> %x, %y + store <2 x i32> %div, <2 x i32>* %z + %1 = urem <2 x i32> %x, %y + ret <2 x i32> %1 +} + +define <4 x i32> @test_sdivrem(<4 x i32> %x, <4 x i32>* %y) { +; CHECK-LABEL: test_sdivrem +; CHECK-DAG: sdivrem + %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + store <4 x i32> %div, <4 x i32>* %y + %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + ret <4 x i32> %1 +} diff --git a/test/CodeGen/AArch64/emutls.ll b/test/CodeGen/AArch64/emutls.ll new file mode 100644 index 0000000000000..ac5762edba98b --- /dev/null +++ b/test/CodeGen/AArch64/emutls.ll @@ -0,0 +1,116 @@ +; RUN: llc -emulated-tls -mtriple=aarch64-linux-android \ +; RUN: -relocation-model=pic < %s | FileCheck -check-prefix=ARM64 %s + +; Copied from X86/emutls.ll + +; Use my_emutls_get_address like __emutls_get_address. 
+@my_emutls_v_xyz = external global i8*, align 4 +declare i8* @my_emutls_get_address(i8*) + +define i32 @my_get_xyz() { +; ARM64-LABEL: my_get_xyz: +; ARM64: adrp x0, :got:my_emutls_v_xyz +; ARM64-NEXT: ldr x0, [x0, :got_lo12:my_emutls_v_xyz] +; ARM64-NEXT: bl my_emutls_get_address +; ARM64-NEXT: ldr w0, [x0] +; ARM64-NEXT: ldp x29, x30, [sp] + +entry: + %call = call i8* @my_emutls_get_address(i8* bitcast (i8** @my_emutls_v_xyz to i8*)) + %0 = bitcast i8* %call to i32* + %1 = load i32, i32* %0, align 4 + ret i32 %1 +} + +@i1 = thread_local global i32 15 +@i2 = external thread_local global i32 +@i3 = internal thread_local global i32 15 +@i4 = hidden thread_local global i32 15 +@i5 = external hidden thread_local global i32 +@s1 = thread_local global i16 15 +@b1 = thread_local global i8 0 + +define i32 @f1() { +; ARM64-LABEL: f1: +; ARM64: adrp x0, :got:__emutls_v.i1 +; ARM64-NEXT: ldr x0, [x0, :got_lo12:__emutls_v.i1] +; ARM64-NEXT: bl __emutls_get_address +; ARM64-NEXT: ldr w0, [x0] +; ARM64-NEXT: ldp x29, x30, [sp] + +entry: + %tmp1 = load i32, i32* @i1 + ret i32 %tmp1 +} + +define i32* @f2() { +; ARM64-LABEL: f2: +; ARM64: adrp x0, :got:__emutls_v.i1 +; ARM64-NEXT: ldr x0, [x0, :got_lo12:__emutls_v.i1] +; ARM64-NEXT: bl __emutls_get_address +; ARM64-NEXT: ldp x29, x30, [sp] + +entry: + ret i32* @i1 +} + +;;;;;;;;;;;;;; 64-bit __emutls_v. and __emutls_t. 
+ +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.i1: +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.i1 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.i1: +; ARM64-NEXT: .word 15 + +; ARM64-NOT: __emutls_v.i2 + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.i3: +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.i3 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.i3: +; ARM64-NEXT: .word 15 + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.i4: +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 4 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.i4 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.i4: +; ARM64-NEXT: .word 15 + +; ARM64-NOT: __emutls_v.i5: +; ARM64 .hidden __emutls_v.i5 +; ARM64-NOT: __emutls_v.i5: + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.s1: +; ARM64-NEXT: .xword 2 +; ARM64-NEXT: .xword 2 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword __emutls_t.s1 + +; ARM64 .section .rodata, +; ARM64-LABEL: __emutls_t.s1: +; ARM64-NEXT: .hword 15 + +; ARM64 .section .data.rel.local, +; ARM64-LABEL: __emutls_v.b1: +; ARM64-NEXT: .xword 1 +; ARM64-NEXT: .xword 1 +; ARM64-NEXT: .xword 0 +; ARM64-NEXT: .xword 0 + +; ARM64-NOT: __emutls_t.b1 diff --git a/test/CodeGen/AArch64/emutls_generic.ll b/test/CodeGen/AArch64/emutls_generic.ll new file mode 100644 index 0000000000000..7664db3df8d27 --- /dev/null +++ b/test/CodeGen/AArch64/emutls_generic.ll @@ -0,0 +1,59 @@ +; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -relocation-model=pic \ +; RUN: | FileCheck -check-prefix=ARM_64 %s +; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -relocation-model=pic -O3 \ +; RUN: | FileCheck -check-prefix=ARM_64 %s +; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -O3 \ +; RUN: | FileCheck -check-prefix=ARM_64 %s + +; Make sure that TLS symbols are 
emitted in expected order. + +@external_x = external thread_local global i32, align 8 +@external_y = thread_local global i8 7, align 2 +@internal_y = internal thread_local global i64 9, align 16 + +define i32* @get_external_x() { +entry: + ret i32* @external_x +} + +define i8* @get_external_y() { +entry: + ret i8* @external_y +} + +define i64* @get_internal_y() { +entry: + ret i64* @internal_y +} + +; ARM_64-LABEL: get_external_x: +; ARM_64: __emutls_v.external_x +; ARM_64: __emutls_get_address +; ARM_64-LABEL: get_external_y: +; ARM_64: __emutls_v.external_y +; ARM_64: __emutls_get_address +; ARM_64-LABEL: get_internal_y: +; ARM_64: __emutls_v.internal_y +; ARM_64: __emutls_get_address +; ARM_64-NOT: __emutls_t.external_x +; ARM_64-NOT: __emutls_v.external_x: +; ARM_64: .align 3 +; ARM_64-LABEL: __emutls_v.external_y: +; ARM_64-NEXT: .xword 1 +; ARM_64-NEXT: .xword 2 +; ARM_64-NEXT: .xword 0 +; ARM_64-NEXT: .xword __emutls_t.external_y +; ARM_64-NOT: __emutls_v.external_x: +; ARM_64: .section .rodata, +; ARM_64-LABEL: __emutls_t.external_y: +; ARM_64-NEXT: .byte 7 +; ARM_64: .data +; ARM_64: .align 3 +; ARM_64-LABEL: __emutls_v.internal_y: +; ARM_64-NEXT: .xword 8 +; ARM_64-NEXT: .xword 16 +; ARM_64-NEXT: .xword 0 +; ARM_64-NEXT: .xword __emutls_t.internal_y +; ARM_64: .section .rodata, +; ARM_64-LABEL: __emutls_t.internal_y: +; ARM_64-NEXT: .xword 9 diff --git a/test/CodeGen/AArch64/eon.ll b/test/CodeGen/AArch64/eon.ll new file mode 100644 index 0000000000000..ea61ce34c050a --- /dev/null +++ b/test/CodeGen/AArch64/eon.ll @@ -0,0 +1,29 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +; Check that the eon instruction is generated instead of eor,movn +define i64 @test1(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: test1: +; CHECK: eon +; CHECK: ret +entry: + %shl = shl i64 %b, 4 + %neg = xor i64 %a, -1 + %xor = xor i64 %shl, %neg + ret i64 %xor +} + +; Same check with mutliple uses of %neg +define i64 @test2(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: 
test2: +; CHECK: eon +; CHECK: eon +; CHECK: lsl +; CHECK: ret +entry: + %shl = shl i64 %b, 4 + %neg = xor i64 %shl, -1 + %xor = xor i64 %neg, %a + %xor1 = xor i64 %c, %neg + %shl2 = shl i64 %xor, %xor1 + ret i64 %shl2 +} diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll index be5e2e51385d6..e8ecb13b35645 100644 --- a/test/CodeGen/AArch64/f16-instructions.ll +++ b/test/CodeGen/AArch64/f16-instructions.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -143,6 +143,33 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { ret half %r } +; CHECK-LABEL: test_select_cc_f32_f16: +; CHECK-DAG: fcvt s2, h2 +; CHECK-DAG: fcvt s3, h3 +; CHECK-NEXT: fcmp s2, s3 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: ret +define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { + %cc = fcmp une half %c, %d + %r = select i1 %cc, float %a, float %b + ret float %r +} + +; CHECK-LABEL: test_select_cc_f16_f32: +; CHECK-DAG: fcvt s0, h0 +; CHECK-DAG: fcvt s1, h1 +; CHECK-DAG: fcmp s2, s3 +; CHECK-DAG: cset w8, ne +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 { + %cc = fcmp une float %c, %d + %r = select i1 %cc, half %a, half %b + ret half %r +} + ; CHECK-LABEL: test_fcmp_une: ; CHECK-NEXT: fcvt s1, h1 ; CHECK-NEXT: fcvt s0, h0 @@ -644,13 +671,10 @@ define half @test_fabs(half %a) #0 { } ; CHECK-LABEL: test_minnum: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
-; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: bl {{_?}}fminf +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fminnm s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldp x29, x30, [sp], #16 ; CHECK-NEXT: ret define half @test_minnum(half %a, half %b) #0 { %r = call half @llvm.minnum.f16(half %a, half %b) @@ -658,13 +682,10 @@ define half @test_minnum(half %a, half %b) #0 { } ; CHECK-LABEL: test_maxnum: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: bl {{_?}}fmaxf +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldp x29, x30, [sp], #16 ; CHECK-NEXT: ret define half @test_maxnum(half %a, half %b) #0 { %r = call half @llvm.maxnum.f16(half %a, half %b) @@ -683,11 +704,50 @@ define half @test_copysign(half %a, half %b) #0 { ret half %r } -; CHECK-LABEL: test_floor: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frintm s0, s1 +; CHECK-LABEL: test_copysign_f32: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: ret +define half @test_copysign_f32(half %a, float %b) #0 { + %tb = fptrunc float %b to half + %r = call half @llvm.copysign.f16(half %a, half %tb) + ret half %r +} + +; CHECK-LABEL: test_copysign_f64: +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_copysign_f64(half %a, double %b) #0 { + %tb = fptrunc double %b to half + %r = call half @llvm.copysign.f16(half %a, half %tb) + ret half %r +} + +; Check that the FP promotion will use a truncating FP_ROUND, so we can fold +; away the (fpext (fp_round )) here. 
+ +; CHECK-LABEL: test_copysign_extended: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret +define float @test_copysign_extended(half %a, half %b) #0 { + %r = call half @llvm.copysign.f16(half %a, half %b) + %xr = fpext half %r to float + ret float %xr +} + +; CHECK-LABEL: test_floor: +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frintm [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_floor(half %a) #0 { %r = call half @llvm.floor.f16(half %a) @@ -695,10 +755,9 @@ define half @test_floor(half %a) #0 { } ; CHECK-LABEL: test_ceil: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frintp s0, s1 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frintp [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_ceil(half %a) #0 { %r = call half @llvm.ceil.f16(half %a) @@ -706,10 +765,9 @@ define half @test_ceil(half %a) #0 { } ; CHECK-LABEL: test_trunc: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frintz s0, s1 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frintz [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_trunc(half %a) #0 { %r = call half @llvm.trunc.f16(half %a) @@ -737,10 +795,9 @@ define half @test_nearbyint(half %a) #0 { } ; CHECK-LABEL: test_round: -; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: frinta s0, s1 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 +; CHECK-NEXT: frinta [[INT32:s[0-9]+]], [[FLOAT32]] +; CHECK-NEXT: fcvt h0, [[INT32]] ; CHECK-NEXT: ret define half @test_round(half %a) #0 { %r = call half @llvm.round.f16(half %a) diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll 
b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll new file mode 100644 index 0000000000000..55fbf63319ee3 --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel -fast-isel-abort=0 -verify-machineinstrs < %s | FileCheck %s + +define void @test(i64 %a, i64 %b, i2* %c) { +; CHECK-LABEL: test +; CHECK: and [[REG1:w[0-9]+]], w8, #0x3 +; CHECK-NEXT: strb [[REG1]], {{\[}}x2{{\]}} +; CHECK-NEXT: tbz w9, #0, + %1 = trunc i64 %a to i2 + %2 = trunc i64 %b to i1 +; Force fast-isel to fall back to SDAG. + store i2 %1, i2* %c, align 8 + br i1 %2, label %bb1, label %bb2 + +bb1: + ret void + +bb2: + ret void +} diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll index da6ddbf5101ea..e04a62b85c8eb 100644 --- a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s -; CHECK-label: test_or +; CHECK-LABEL: test_or ; CHECK: cbnz w0, {{LBB[0-9]+_2}} ; CHECK: cbz w1, {{LBB[0-9]+_1}} define i64 @test_or(i32 %a, i32 %b) { @@ -18,7 +18,7 @@ bb4: ret i64 %2 } -; CHECK-label: test_ans +; CHECK-LABEL: test_and ; CHECK: cbz w0, {{LBB[0-9]+_2}} ; CHECK: cbnz w1, {{LBB[0-9]+_3}} define i64 @test_and(i32 %a, i32 %b) { @@ -36,7 +36,55 @@ bb4: ret i64 %2 } +; If the branch is unpredictable, don't add another branch. 
+ +; CHECK-LABEL: test_or_unpredictable +; CHECK: cmp w0, #0 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: cset w9, eq +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: tbnz w8, #0, +define i64 @test_or_unpredictable(i32 %a, i32 %b) { +bb1: + %0 = icmp eq i32 %a, 0 + %1 = icmp eq i32 %b, 0 + %or.cond = or i1 %0, %1 + br i1 %or.cond, label %bb3, label %bb4, !unpredictable !2 + +bb3: + ret i64 0 + +bb4: + %2 = call i64 @bar() + ret i64 %2 +} + +; CHECK-LABEL: test_and_unpredictable +; CHECK: cmp w0, #0 +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: cset w9, ne +; CHECK-NEXT: and w8, w8, w9 +; CHECK-NEXT: tbz w8, #0, +define i64 @test_and_unpredictable(i32 %a, i32 %b) { +bb1: + %0 = icmp ne i32 %a, 0 + %1 = icmp ne i32 %b, 0 + %or.cond = and i1 %0, %1 + br i1 %or.cond, label %bb4, label %bb3, !unpredictable !2 + +bb3: + ret i64 0 + +bb4: + %2 = call i64 @bar() + ret i64 %2 +} + declare i64 @bar() !0 = !{!"branch_weights", i32 5128, i32 32} !1 = !{!"branch_weights", i32 1024, i32 4136} +!2 = !{} + diff --git a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll new file mode 100644 index 0000000000000..2855419a1ca0c --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll @@ -0,0 +1,100 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs \ +; RUN: -aarch64-atomic-cfg-tidy=0 -disable-cgp -disable-branch-fold \ +; RUN: < %s | FileCheck %s + +; +; Verify that we don't mess up vector comparisons in fast-isel. 
+; + +define <2 x i32> @icmp_v2i32(<2 x i32> %a) { +; CHECK-LABEL: icmp_v2i32: +; CHECK: ; BB#0: +; CHECK-NEXT: cmeq.2s [[CMP:v[0-9]+]], v0, #0 +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b v0, [[CMP]], [[MASK]] +; CHECK-NEXT: ret + %c = icmp eq <2 x i32> %a, zeroinitializer + br label %bb2 +bb2: + %z = zext <2 x i1> %c to <2 x i32> + ret <2 x i32> %z +} + +define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) { +; CHECK-LABEL: icmp_constfold_v2i32: +; CHECK: ; BB#0: +; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b v0, v[[CMP]], [[MASK]] +; CHECK-NEXT: ret + %1 = icmp eq <2 x i32> %a, %a + br label %bb2 +bb2: + %2 = zext <2 x i1> %1 to <2 x i32> + ret <2 x i32> %2 +} + +define <4 x i32> @icmp_v4i32(<4 x i32> %a) { +; CHECK-LABEL: icmp_v4i32: +; CHECK: ; BB#0: +; CHECK-NEXT: cmeq.4s [[CMP:v[0-9]+]], v0, #0 +; CHECK-NEXT: xtn.4h [[CMPV4I16:v[0-9]+]], [[CMP]] +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b [[ZEXT:v[0-9]+]], [[CMPV4I16]], [[MASK]] +; CHECK-NEXT: ushll.4s v0, [[ZEXT]], #0 +; CHECK-NEXT: ret + %c = icmp eq <4 x i32> %a, zeroinitializer + br label %bb2 +bb2: + %z = zext <4 x i1> %c to <4 x i32> + ret <4 x i32> %z +} + +define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) { +; CHECK-LABEL: icmp_constfold_v4i32: +; CHECK: ; BB#0: +; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]] +; CHECK-NEXT: ushll.4s v0, [[ZEXT]], #0 +; CHECK-NEXT: ret + %1 = icmp eq <4 x i32> %a, %a + br label %bb2 +bb2: + %2 = zext <4 x i1> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <16 x i8> @icmp_v16i8(<16 x i8> %a) { +; CHECK-LABEL: icmp_v16i8: +; CHECK: ; BB#0: +; CHECK-NEXT: cmeq.16b [[CMP:v[0-9]+]], v0, #0 +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.16b 
[[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.16b v0, [[CMP]], [[MASK]] +; CHECK-NEXT: ret + %c = icmp eq <16 x i8> %a, zeroinitializer + br label %bb2 +bb2: + %z = zext <16 x i1> %c to <16 x i8> + ret <16 x i8> %z +} + +define <16 x i8> @icmp_constfold_v16i8(<16 x i8> %a) { +; CHECK-LABEL: icmp_constfold_v16i8: +; CHECK: ; BB#0: +; CHECK-NEXT: movi.2d [[CMP:v[0-9]+]], #0xffffffffffffffff +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: movi.16b [[MASK:v[0-9]+]], #0x1 +; CHECK-NEXT: and.16b v0, [[CMP]], [[MASK]] +; CHECK-NEXT: ret + %1 = icmp eq <16 x i8> %a, %a + br label %bb2 +bb2: + %2 = zext <16 x i1> %1 to <16 x i8> + ret <16 x i8> %2 +} diff --git a/test/CodeGen/AArch64/fast-isel-folded-shift.ll b/test/CodeGen/AArch64/fast-isel-folded-shift.ll new file mode 100644 index 0000000000000..b881ef5c6d52d --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-folded-shift.ll @@ -0,0 +1,125 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s + +; Test invalid shift values. This will fall-back to SDAG. 
+; AND +define zeroext i8 @and_rs_i8(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: and_rs_i8 +; CHECK: and [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff + %1 = shl i8 %b, 8 + %2 = and i8 %a, %1 + ret i8 %2 +} + +define zeroext i16 @and_rs_i16(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: and_rs_i16 +; CHECK: and [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff + %1 = shl i16 %b, 16 + %2 = and i16 %a, %1 + ret i16 %2 +} + +define i32 @and_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: and_rs_i32 +; CHECK: and w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = and i32 %a, %1 + ret i32 %2 +} + +define i64 @and_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: and_rs_i64 +; CHECK: and x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = and i64 %a, %1 + ret i64 %2 +} + +; OR +define zeroext i8 @or_rs_i8(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: or_rs_i8 +; CHECK: orr [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff + %1 = shl i8 %b, 8 + %2 = or i8 %a, %1 + ret i8 %2 +} + +define zeroext i16 @or_rs_i16(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: or_rs_i16 +; CHECK: orr [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff + %1 = shl i16 %b, 16 + %2 = or i16 %a, %1 + ret i16 %2 +} + +define i32 @or_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: or_rs_i32 +; CHECK: orr w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = or i32 %a, %1 + ret i32 %2 +} + +define i64 @or_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: or_rs_i64 +; CHECK: orr x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = or i64 %a, %1 + ret i64 %2 +} + +; XOR +define zeroext i8 @xor_rs_i8(i8 %a, i8 %b) { +; CHECK-LABEL: xor_rs_i8 +; CHECK: eor [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xff + %1 = shl i8 %b, 8 + %2 = xor i8 %a, %1 + ret i8 %2 +} + +define zeroext i16 @xor_rs_i16(i16 %a, i16 %b) { +; CHECK-LABEL: xor_rs_i16 +; CHECK: eor [[REG:w[0-9]+]], w0, w8 +; CHECK-NEXT: and {{w[0-9]+}}, [[REG]], #0xffff + %1 = shl i16 %b, 16 + %2 = xor i16 %a, 
%1 + ret i16 %2 +} + +define i32 @xor_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: xor_rs_i32 +; CHECK: eor w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = xor i32 %a, %1 + ret i32 %2 +} + +define i64 @xor_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: xor_rs_i64 +; CHECK: eor x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = xor i64 %a, %1 + ret i64 %2 +} + +;ADD +define i32 @add_rs_i32(i32 %a, i32 %b) { +; CHECK-LABEL: add_rs_i32 +; CHECK: add w0, w0, w8 + %1 = shl i32 %b, 32 + %2 = add i32 %a, %1 + ret i32 %2 +} + +define i64 @add_rs_i64(i64 %a, i64 %b) { +; CHECK-LABEL: add_rs_i64 +; CHECK: add x0, x0, x8 + %1 = shl i64 %b, 64 + %2 = add i64 %a, %1 + ret i64 %2 +} + diff --git a/test/CodeGen/AArch64/fast-isel-logic-op.ll b/test/CodeGen/AArch64/fast-isel-logic-op.ll index 89c5f2c480243..16d0429fe98df 100644 --- a/test/CodeGen/AArch64/fast-isel-logic-op.ll +++ b/test/CodeGen/AArch64/fast-isel-logic-op.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s ; AND diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll index a392619a768dc..b5e03f08280ff 100644 --- a/test/CodeGen/AArch64/fastcc-reserved.ll +++ b/test/CodeGen/AArch64/fastcc-reserved.ll @@ -16,7 +16,7 @@ define fastcc void @foo(i32 %in) { ; CHECK: mov x29, sp ; Reserve space for call-frame: -; CHECK: sub sp, sp, #16 +; CHECK: str w{{[0-9]+}}, [sp, #-16]! call fastcc void @will_pop([8 x i32] undef, i32 42) ; CHECK: bl will_pop @@ -42,7 +42,7 @@ define void @foo1(i32 %in) { ; CHECK: mov x29, sp ; Reserve space for call-frame -; CHECK: sub sp, sp, #16 +; CHECK: str w{{[0-9]+}}, [sp, #-16]! 
call void @wont_pop([8 x i32] undef, i32 42) ; CHECK: bl wont_pop diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll index 9917fcd044fdd..f021eb2326188 100644 --- a/test/CodeGen/AArch64/fastcc.ll +++ b/test/CodeGen/AArch64/fastcc.ll @@ -7,12 +7,12 @@ define fastcc void @func_stack0() { ; CHECK-LABEL: func_stack0: ; CHECK: mov x29, sp -; CHECK-NEXT: sub sp, sp, #32 +; CHECK: str w{{[0-9]+}}, [sp, #-32]! ; CHECK-TAIL-LABEL: func_stack0: ; CHECK-TAIL: stp x29, x30, [sp, #-16]! ; CHECK-TAIL-NEXT: mov x29, sp -; CHECK-TAIL-NEXT: sub sp, sp, #32 +; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]! call fastcc void @func_stack8([8 x i32] undef, i32 42) @@ -55,13 +55,13 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) { ; CHECK-LABEL: func_stack8: ; CHECK: stp x29, x30, [sp, #-16]! ; CHECK: mov x29, sp -; CHECK: sub sp, sp, #32 +; CHECK: str w{{[0-9]+}}, [sp, #-32]! ; CHECK-TAIL-LABEL: func_stack8: ; CHECK-TAIL: stp x29, x30, [sp, #-16]! ; CHECK-TAIL: mov x29, sp -; CHECK-TAIL: sub sp, sp, #32 +; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]! 
call fastcc void @func_stack8([8 x i32] undef, i32 42) diff --git a/test/CodeGen/AArch64/fcvt_combine.ll b/test/CodeGen/AArch64/fcvt_combine.ll new file mode 100644 index 0000000000000..093ce4a4cd857 --- /dev/null +++ b/test/CodeGen/AArch64/fcvt_combine.ll @@ -0,0 +1,154 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzs.2s v0, v0, #4 +; CHECK: ret +define <2 x i32> @test1(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; CHECK-LABEL: test2 +; CHECK-NOT: fmul.4s +; CHECK: fcvtzs.4s v0, v0, #3 +; CHECK: ret +define <4 x i32> @test2(<4 x float> %f) { + %mul.i = fmul <4 x float> %f, + %vcvt.i = fptosi <4 x float> %mul.i to <4 x i32> + ret <4 x i32> %vcvt.i +} + +; CHECK-LABEL: test3 +; CHECK-NOT: fmul.2d +; CHECK: fcvtzs.2d v0, v0, #5 +; CHECK: ret +define <2 x i64> @test3(<2 x double> %d) { + %mul.i = fmul <2 x double> %d, + %vcvt.i = fptosi <2 x double> %mul.i to <2 x i64> + ret <2 x i64> %vcvt.i +} + +; Truncate double to i32 +; CHECK-LABEL: test4 +; CHECK-NOT: fmul.2d v0, v0, #4 +; CHECK: fcvtzs.2d v0, v0 +; CHECK: xtn.2s +; CHECK: ret +define <2 x i32> @test4(<2 x double> %d) { + %mul.i = fmul <2 x double> %d, + %vcvt.i = fptosi <2 x double> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Truncate float to i16 +; CHECK-LABEL: test5 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzs.2s v0, v0, #4 +; CHECK: ret +define <2 x i16> @test5(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i16> + ret <2 x i16> %vcvt.i +} + +; Don't convert float to i64 +; CHECK-LABEL: test6 +; CHECK: fmov.2s v1, #16.00000000 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtl v0.2d, v0.2s +; CHECK: fcvtzs.2d v0, v0 +; CHECK: ret +define <2 x i64> @test6(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x 
i64> + ret <2 x i64> %vcvt.i +} + +; Check unsigned conversion. +; CHECK-LABEL: test7 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzu.2s v0, v0, #4 +; CHECK: ret +define <2 x i32> @test7(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test which should not fold due to non-power of 2. +; CHECK-LABEL: test8 +; CHECK: fmov.2s v1, #17.00000000 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzu.2s v0, v0 +; CHECK: ret +define <2 x i32> @test8(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test which should not fold due to non-matching power of 2. +; CHECK-LABEL: test9 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzu.2s v0, v0 +; CHECK: ret +define <2 x i32> @test9(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Don't combine all undefs. +; CHECK-LABEL: test10 +; CHECK: fmul.2s v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CHECK: fcvtzu.2s v{{[0-9]+}}, v{{[0-9]+}} +; CHECK: ret +define <2 x i32> @test10(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Combine if mix of undef and pow2. +; CHECK-LABEL: test11 +; CHECK: fcvtzu.2s v0, v0, #3 +; CHECK: ret +define <2 x i32> @test11(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Don't combine when multiplied by 0.0. +; CHECK-LABEL: test12 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzs.2s v0, v0 +; CHECK: ret +define <2 x i32> @test12(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test which should not fold due to power of 2 out of range (i.e., 2^33). 
+; CHECK-LABEL: test13 +; CHECK: fmul.2s v0, v0, v1 +; CHECK: fcvtzs.2s v0, v0 +; CHECK: ret +define <2 x i32> @test13(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test case where const is max power of 2 (i.e., 2^32). +; CHECK-LABEL: test14 +; CHECK: fcvtzs.2s v0, v0, #32 +; CHECK: ret +define <2 x i32> @test14(<2 x float> %f) { + %mul.i = fmul <2 x float> %f, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} diff --git a/test/CodeGen/AArch64/fdiv_combine.ll b/test/CodeGen/AArch64/fdiv_combine.ll new file mode 100644 index 0000000000000..6f38a267ec3fe --- /dev/null +++ b/test/CodeGen/AArch64/fdiv_combine.ll @@ -0,0 +1,115 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s + +; Test signed conversion. +; CHECK-LABEL: @test1 +; CHECK: scvtf.2s v0, v0, #4 +; CHECK: ret +define <2 x float> @test1(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test unsigned conversion. +; CHECK-LABEL: @test2 +; CHECK: ucvtf.2s v0, v0, #3 +; CHECK: ret +define <2 x float> @test2(<2 x i32> %in) { +entry: + %vcvt.i = uitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test which should not fold due to non-power of 2. +; CHECK-LABEL: @test3 +; CHECK: scvtf.2s v0, v0 +; CHECK: fmov.2s v1, #9.00000000 +; CHECK: fdiv.2s v0, v0, v1 +; CHECK: ret +define <2 x float> @test3(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test which should not fold due to power of 2 out of range. 
+; CHECK-LABEL: @test4 +; CHECK: scvtf.2s v0, v0 +; CHECK: movi.2s v1, #0x50, lsl #24 +; CHECK: fdiv.2s v0, v0, v1 +; CHECK: ret +define <2 x float> @test4(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test case where const is max power of 2 (i.e., 2^32). +; CHECK-LABEL: @test5 +; CHECK: scvtf.2s v0, v0, #32 +; CHECK: ret +define <2 x float> @test5(<2 x i32> %in) { +entry: + %vcvt.i = sitofp <2 x i32> %in to <2 x float> + %div.i = fdiv <2 x float> %vcvt.i, + ret <2 x float> %div.i +} + +; Test quadword. +; CHECK-LABEL: @test6 +; CHECK: scvtf.4s v0, v0, #2 +; CHECK: ret +define <4 x float> @test6(<4 x i32> %in) { +entry: + %vcvt.i = sitofp <4 x i32> %in to <4 x float> + %div.i = fdiv <4 x float> %vcvt.i, + ret <4 x float> %div.i +} + +; Test unsigned i16 to float +; CHECK-LABEL: @test7 +; CHECK: ushll.4s v0, v0, #0 +; CHECK: ucvtf.4s v0, v0, #1 +; CHECK: ret +define <4 x float> @test7(<4 x i16> %in) { + %conv = uitofp <4 x i16> %in to <4 x float> + %shift = fdiv <4 x float> %conv, + ret <4 x float> %shift +} + +; Test signed i16 to float +; CHECK-LABEL: @test8 +; CHECK: sshll.4s v0, v0, #0 +; CHECK: scvtf.4s v0, v0, #2 +; CHECK: ret +define <4 x float> @test8(<4 x i16> %in) { + %conv = sitofp <4 x i16> %in to <4 x float> + %shift = fdiv <4 x float> %conv, + ret <4 x float> %shift +} + +; Can't convert i64 to float. 
+; CHECK-LABEL: @test9 +; CHECK: ucvtf.2d v0, v0 +; CHECK: fcvtn v0.2s, v0.2d +; CHECK: movi.2s v1, #0x40, lsl #24 +; CHECK: fdiv.2s v0, v0, v1 +; CHECK: ret +define <2 x float> @test9(<2 x i64> %in) { + %conv = uitofp <2 x i64> %in to <2 x float> + %shift = fdiv <2 x float> %conv, + ret <2 x float> %shift +} + +; CHECK-LABEL: @test10 +; CHECK: ucvtf.2d v0, v0, #1 +; CHECK: ret +define <2 x double> @test10(<2 x i64> %in) { + %conv = uitofp <2 x i64> %in to <2 x double> + %shift = fdiv <2 x double> %conv, + ret <2 x double> %shift +} diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll index 2dd0d1245930b..c0fec4d171cd1 100644 --- a/test/CodeGen/AArch64/fold-constants.ll +++ b/test/CodeGen/AArch64/fold-constants.ll @@ -3,9 +3,6 @@ define i64 @dotests_616() { ; CHECK-LABEL: dotests_616 ; CHECK: movi d0, #0000000000000000 -; CHECK-NEXT: umov w8, v0.b[2] -; CHECK-NEXT: sbfx w8, w8, #0, #1 -; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -19,3 +16,19 @@ entry: %vget_lane = extractelement <1 x i64> %4, i32 0 ret i64 %vget_lane } + +; PR25763 - folding constant vector comparisons with sign-extended result +define <8 x i16> @dotests_458() { +; CHECK-LABEL: dotests_458 +; CHECK: movi d0, #0x00000000ff0000 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret +entry: + %vclz_v.i = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> , i1 false) #6 + %vsra_n = lshr <8 x i8> %vclz_v.i, + %name_6 = or <8 x i8> %vsra_n, + %cmp.i603 = icmp slt <8 x i8> %name_6, + %vmovl.i4.i = sext <8 x i1> %cmp.i603 to <8 x i16> + ret <8 x i16> %vmovl.i4.i +} +declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll index 0dbda152fca91..f6e4bdf734599 100644 --- a/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -130,7 +130,6 @@ define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) { ret <4 x 
i16> %2 } - define <4 x half> @sitofp_i8(<4 x i8> %a) #0 { ; CHECK-LABEL: sitofp_i8: ; CHECK-NEXT: shl [[OP1:v[0-9]+\.4h]], v0.4h, #8 @@ -218,4 +217,54 @@ define <4 x half> @uitofp_i64(<4 x i64> %a) #0 { ret <4 x half> %1 } +define void @test_insert_at_zero(half %a, <4 x half>* %b) #0 { +; CHECK-LABEL: test_insert_at_zero: +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %1 = insertelement <4 x half> undef, half %a, i64 0 + store <4 x half> %1, <4 x half>* %b, align 4 + ret void +} + +define <4 x i8> @fptosi_i8(<4 x half> %a) #0 { +; CHECK-LABEL: fptosi_i8: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptosi<4 x half> %a to <4 x i8> + ret <4 x i8> %1 +} + +define <4 x i16> @fptosi_i16(<4 x half> %a) #0 { +; CHECK-LABEL: fptosi_i16: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptosi<4 x half> %a to <4 x i16> + ret <4 x i16> %1 +} + +define <4 x i8> @fptoui_i8(<4 x half> %a) #0 { +; CHECK-LABEL: fptoui_i8: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; NOTE: fcvtzs selected here because the xtn shaves the sign bit +; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptoui<4 x half> %a to <4 x i8> + ret <4 x i8> %1 +} + +define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { +; CHECK-LABEL: fptoui_i16: +; CHECK-NEXT: fcvtl [[REG1:v[0-9]+\.4s]], v0.4h +; CHECK-NEXT: fcvtzu [[REG2:v[0-9]+\.4s]], [[REG1]] +; CHECK-NEXT: xtn v0.4h, [[REG2]] +; CHECK-NEXT: ret + %1 = fptoui<4 x half> %a to <4 x i16> + ret <4 x i16> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll index 10a8c22d6f7ef..137d1f358a304 100644 --- a/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ 
b/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -358,4 +358,67 @@ define <8 x half> @uitofp_i64(<8 x i64> %a) #0 { ret <8 x half> %1 } +define void @test_insert_at_zero(half %a, <8 x half>* %b) #0 { +; CHECK-LABEL: test_insert_at_zero: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %1 = insertelement <8 x half> undef, half %a, i64 0 + store <8 x half> %1, <8 x half>* %b, align 4 + ret void +} + +define <8 x i8> @fptosi_i8(<8 x half> %a) #0 { +; CHECK-LABEL: fptosi_i8: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: xtn v0.8b, [[I16]].8h +; CHECK-NEXT: ret + %1 = fptosi<8 x half> %a to <8 x i8> + ret <8 x i8> %1 +} + +define <8 x i16> @fptosi_i16(<8 x half> %a) #0 { +; CHECK-LABEL: fptosi_i16: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzs [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzs [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret + %1 = fptosi<8 x half> %a to <8 x i16> + ret <8 x i16> %1 +} + +define <8 x i8> @fptoui_i8(<8 x half> %a) #0 { +; CHECK-LABEL: fptoui_i8: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzu [[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-DAG: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: xtn v0.8b, [[I16]].8h +; CHECK-NEXT: ret + %1 = fptoui<8 x half> %a to <8 x i8> + ret <8 x i8> %1 +} + +define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { +; CHECK-LABEL: fptoui_i16: +; CHECK-DAG: fcvtl [[LO:v[0-9]+\.4s]], v0.4h +; CHECK-DAG: fcvtl2 [[HI:v[0-9]+\.4s]], v0.8h +; CHECK-DAG: fcvtzu 
[[LOF32:v[0-9]+\.4s]], [[LO]] +; CHECK-DAG: xtn [[I16:v[0-9]+]].4h, [[LOF32]] +; CHECK-DAG: fcvtzu [[HIF32:v[0-9]+\.4s]], [[HI]] +; CHECK-NEXT: xtn2 [[I16]].8h, [[HIF32]] +; CHECK-NEXT: ret + %1 = fptoui<8 x half> %a to <8 x i16> + ret <8 x i16> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/free-zext.ll b/test/CodeGen/AArch64/free-zext.ll index cff11f85bda4e..ea4f1f4e10f3e 100644 --- a/test/CodeGen/AArch64/free-zext.ll +++ b/test/CodeGen/AArch64/free-zext.ll @@ -1,7 +1,7 @@ ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s define i64 @test_free_zext(i8* %a, i16* %b) { -; CHECK-LABEL: test_free_zext +; CHECK-LABEL: test_free_zext: ; CHECK-DAG: ldrb w[[A:[0-9]+]], [x0] ; CHECK: ldrh w[[B:[0-9]+]], [x1] ; CHECK: add x0, x[[B]], x[[A]] @@ -12,3 +12,60 @@ define i64 @test_free_zext(i8* %a, i16* %b) { %add = add nsw i64 %conv1, %conv ret i64 %add } + +define void @test_free_zext2(i32* %ptr, i32* %dst1, i64* %dst2) { +; CHECK-LABEL: test_free_zext2: +; CHECK: ldrh w[[A:[0-9]+]], [x0] +; CHECK-NOT: and x +; CHECK: str w[[A]], [x1] +; CHECK: str x[[A]], [x2] + %load = load i32, i32* %ptr, align 8 + %load16 = and i32 %load, 65535 + %load64 = zext i32 %load16 to i64 + store i32 %load16, i32* %dst1, align 4 + store i64 %load64, i64* %dst2, align 8 + ret void +} + +; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads +; feeding a phi that zext's each loaded value. 
+define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) { +; CHECK-LABEL: test_free_zext3: +bb1: +; CHECK: ldrh [[REG:w[0-9]+]] +; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff + %tmp1 = load i32, i32* %ptr, align 4 + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %bb2, label %bb3 +bb2: +; CHECK: ldrh [[REG2:w[0-9]+]] +; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff + %tmp2 = load i32, i32* %ptr2, align 4 + br label %bb3 +bb3: + %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] +; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff + %tmpand = and i32 %tmp3, 65535 + ret i32 %tmpand +} + +; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able +; load feeding a phi in the same block. +define void @test_free_zext4(i32* %ptr, i32* %ptr2, i32* %dst) { +; CHECK-LABEL: test_free_zext4: +; CHECK: ldrh [[REG:w[0-9]+]] +; TODO: fix isel to remove final and XCHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff +; CHECK: ldrh [[REG:w[0-9]+]] +bb1: + %load1 = load i32, i32* %ptr, align 4 + br label %loop +loop: + %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ] + %and = and i32 %phi, 65535 + store i32 %and, i32* %dst, align 4 + %load2 = load i32, i32* %ptr2, align 4 + %cmp = icmp ne i32 %and, 0 + br i1 %cmp, label %loop, label %end +end: + ret void +} diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll index 9100ae39282bb..2ea13e3888678 100644 --- a/test/CodeGen/AArch64/func-argpassing.ll +++ b/test/CodeGen/AArch64/func-argpassing.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-post-ra | FileCheck --check-prefix=CHECK %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -disable-post-ra | 
FileCheck --check-prefix=CHECK-NOFP %s %myStruct = type { i64 , i8, i32 } diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll index 22a33157fd555..2f45666ba13ae 100644 --- a/test/CodeGen/AArch64/func-calls.ll +++ b/test/CodeGen/AArch64/func-calls.ll @@ -89,11 +89,11 @@ define void @check_stack_args() { ; that varstruct is passed on the stack. Rather dependent on how a ; memcpy gets created, but the following works for now. -; CHECK-DAG: str {{q[0-9]+}}, [sp] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16] ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b -; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp] +; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]! ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]] diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll index 657778e34187d..5e820b8bb3037 100644 --- a/test/CodeGen/AArch64/global-alignment.ll +++ b/test/CodeGen/AArch64/global-alignment.ll @@ -3,7 +3,7 @@ @var32 = global [3 x i32] zeroinitializer @var64 = global [3 x i64] zeroinitializer @var32_align64 = global [3 x i32] zeroinitializer, align 8 -@alias = alias [3 x i32]* @var32_align64 +@alias = alias [3 x i32], [3 x i32]* @var32_align64 define i64 @test_align32() { ; CHECK-LABEL: test_align32: diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll index 14b04303ffb38..b93f41c07df9e 100644 --- a/test/CodeGen/AArch64/global-merge-1.ll +++ b/test/CodeGen/AArch64/global-merge-1.ll @@ -12,16 +12,20 @@ define void @f1(i32 %a1, i32 %a2) { ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals@PAGE +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals@PAGEOFF +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF store i32 %a1, i32* @m, align 4 store i32 %a2, i32* @n, align 4 ret void } -;CHECK: 
.type _MergedGlobals,@object // @_MergedGlobals -;CHECK: .local _MergedGlobals -;CHECK: .comm _MergedGlobals,8,8 +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals +;CHECK: .local .L_MergedGlobals +;CHECK: .comm .L_MergedGlobals,8,8 +;CHECK: m = .L_MergedGlobals +;CHECK: n = .L_MergedGlobals+4 -;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3 ; @_MergedGlobals +;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals,8,3 ; @_MergedGlobals +;CHECK-APPLE-IOS-NOT: _m = l__MergedGlobals +;CHECK-APPLE-IOS-NOT: _n = l__MergedGlobals+4 diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll index af684039bf10f..53bed1d9bc093 100644 --- a/test/CodeGen/AArch64/global-merge-2.ll +++ b/test/CodeGen/AArch64/global-merge-2.ll @@ -9,8 +9,8 @@ define void @f1(i32 %a1, i32 %a2) { ;CHECK-APPLE-IOS-LABEL: _f1: ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF ;CHECK-APPLE-IOS-NOT: adrp store i32 %a1, i32* @x, align 4 store i32 %a2, i32* @y, align 4 @@ -19,34 +19,34 @@ define void @f1(i32 %a1, i32 %a2) { define void @g1(i32 %a1, i32 %a2) { ;CHECK-APPLE-IOS-LABEL: _g1: -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF ;CHECK-APPLE-IOS-NOT: adrp store i32 %a1, i32* @y, align 4 store i32 %a2, i32* @z, align 4 ret void } -;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x -;CHECK: .globl _MergedGlobals_x -;CHECK: .align 3 -;CHECK: _MergedGlobals_x: -;CHECK: .size _MergedGlobals_x, 12 +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals +;CHECK: .local .L_MergedGlobals +;CHECK: .comm .L_MergedGlobals,12,8 ;CHECK: .globl x -;CHECK: x = _MergedGlobals_x +;CHECK: x = 
.L_MergedGlobals +;CHECK: .size x, 4 ;CHECK: .globl y -;CHECK: y = _MergedGlobals_x+4 +;CHECK: y = .L_MergedGlobals+4 +;CHECK: .size y, 4 ;CHECK: .globl z -;CHECK: z = _MergedGlobals_x+8 +;CHECK: z = .L_MergedGlobals+8 +;CHECK: .size z, 4 -;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x -;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,3 +;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals,12,3 ;CHECK-APPLE-IOS: .globl _x -;CHECK-APPLE-IOS: _x = __MergedGlobals_x +;CHECK-APPLE-IOS: = l__MergedGlobals ;CHECK-APPLE-IOS: .globl _y -;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4 +;CHECK-APPLE-IOS: _y = l__MergedGlobals+4 ;CHECK-APPLE-IOS: .globl _z -;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8 +;CHECK-APPLE-IOS: _z = l__MergedGlobals+8 ;CHECK-APPLE-IOS: .subsections_via_symbols diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll index 925108308e563..6895380ca63e9 100644 --- a/test/CodeGen/AArch64/global-merge-3.ll +++ b/test/CodeGen/AArch64/global-merge-3.ll @@ -1,17 +1,17 @@ -; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS @x = global [1000 x i32] zeroinitializer, align 1 @y = global [1000 x i32] zeroinitializer, align 1 @z = internal global i32 1, align 4 define void 
@f1(i32 %a1, i32 %a2, i32 %a3) { -;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE +;CHECK-APPLE-IOS: adrp x8, l__MergedGlobals@PAGE ;CHECK-APPLE-IOS-NOT: adrp -;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF -;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_y@PAGE -;CHECK-APPLE-IOS: add x9, x9, __MergedGlobals_y@PAGEOFF +;CHECK-APPLE-IOS: add x8, x8, l__MergedGlobals@PAGEOFF +;CHECK-APPLE-IOS: adrp x9, l__MergedGlobals.1@PAGE +;CHECK-APPLE-IOS: add x9, x9, l__MergedGlobals.1@PAGEOFF %x3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @x, i32 0, i64 3 %y3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @y, i32 0, i64 3 store i32 %a1, i32* %x3, align 4 @@ -20,32 +20,32 @@ define void @f1(i32 %a1, i32 %a2, i32 %a3) { ret void } -;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x -;CHECK: .globl _MergedGlobals_x +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals ;CHECK: .align 4 -;CHECK: _MergedGlobals_x: -;CHECK: .size _MergedGlobals_x, 4004 +;CHECK: .L_MergedGlobals: +;CHECK: .size .L_MergedGlobals, 4004 -;CHECK: .type _MergedGlobals_y,@object // @_MergedGlobals_y -;CHECK: .globl _MergedGlobals_y -;CHECK: _MergedGlobals_y: -;CHECK: .size _MergedGlobals_y, 4000 +;CHECK: .type .L_MergedGlobals.1,@object // @_MergedGlobals.1 +;CHECK: .local .L_MergedGlobals.1 +;CHECK: .comm .L_MergedGlobals.1,4000,16 -;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x ;CHECK-APPLE-IOS: .align 4 -;CHECK-APPLE-IOS: __MergedGlobals_x: +;CHECK-APPLE-IOS: l__MergedGlobals: ;CHECK-APPLE-IOS: .long 1 ;CHECK-APPLE-IOS: .space 4000 -;CHECK-APPLE-IOS: .globl __MergedGlobals_y ; @_MergedGlobals_y -;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_y,4000,4 +;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals.1,4000,4 +;CHECK: z = .L_MergedGlobals ;CHECK: .globl x -;CHECK: x = _MergedGlobals_x+4 +;CHECK: x = .L_MergedGlobals+4 +;CHECK: .size x, 4000 ;CHECK: .globl y -;CHECK: y = _MergedGlobals_y +;CHECK: y = .L_MergedGlobals.1 +;CHECK: .size 
y, 4000 +;CHECK-APPLE-IOS-NOT: _z = l__MergedGlobals ;CHECK-APPLE-IOS:.globl _x -;CHECK-APPLE-IOS: _x = __MergedGlobals_x+4 +;CHECK-APPLE-IOS: _x = l__MergedGlobals+4 ;CHECK-APPLE-IOS:.globl _y -;CHECK-APPLE-IOS: _y = __MergedGlobals_y +;CHECK-APPLE-IOS: _y = l__MergedGlobals.1 diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll index bc6b68a9c046a..a5109f6e8ea59 100644 --- a/test/CodeGen/AArch64/global-merge-4.ll +++ b/test/CodeGen/AArch64/global-merge-4.ll @@ -64,9 +64,9 @@ define internal i32* @returnFoo() #1 { ret i32* getelementptr inbounds ([5 x i32], [5 x i32]* @foo, i64 0, i64 0) } -;CHECK: .type _MergedGlobals,@object // @_MergedGlobals -;CHECK: .local _MergedGlobals -;CHECK: .comm _MergedGlobals,60,16 +;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals +;CHECK: .local .L_MergedGlobals +;CHECK: .comm .L_MergedGlobals,60,16 attributes #0 = { nounwind ssp } attributes #1 = { nounwind readnone ssp } diff --git a/test/CodeGen/AArch64/global-merge-group-by-use.ll b/test/CodeGen/AArch64/global-merge-group-by-use.ll index ddc044ed9e082..8b3fc97c9e2e3 100644 --- a/test/CodeGen/AArch64/global-merge-group-by-use.ll +++ b/test/CodeGen/AArch64/global-merge-group-by-use.ll @@ -12,7 +12,7 @@ ; CHECK-LABEL: f1: define void @f1(i32 %a1, i32 %a2) #0 { -; CHECK-NEXT: adrp x8, [[SET1:__MergedGlobals.[0-9]*]]@PAGE +; CHECK-NEXT: adrp x8, [[SET1:l__MergedGlobals.[0-9]*]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET1]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret @@ -27,7 +27,7 @@ define void @f1(i32 %a1, i32 %a2) #0 { ; CHECK-LABEL: f2: define void @f2(i32 %a1, i32 %a2, i32 %a3) #0 { -; CHECK-NEXT: adrp x8, [[SET2:__MergedGlobals.[0-9]*]]@PAGE +; CHECK-NEXT: adrp x8, [[SET2:l__MergedGlobals.[0-9]*]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET2]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: str w2, [x8, #8] @@ -48,7 +48,7 @@ define void @f2(i32 %a1, i32 %a2, i32 %a3) #0 { ; CHECK-LABEL: f3: define void @f3(i32 %a1, i32 %a2) 
#0 { ; CHECK-NEXT: adrp x8, _m3@PAGE -; CHECK-NEXT: adrp x9, [[SET3:__MergedGlobals[0-9]*]]@PAGE +; CHECK-NEXT: adrp x9, [[SET3:l__MergedGlobals[0-9]*]]@PAGE ; CHECK-NEXT: str w0, [x8, _m3@PAGEOFF] ; CHECK-NEXT: str w1, [x9, [[SET3]]@PAGEOFF] ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll index e83cbab140a74..3994389257719 100644 --- a/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll +++ b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll @@ -11,7 +11,7 @@ ; CHECK-LABEL: f1: define void @f1(i32 %a1, i32 %a2) minsize nounwind { -; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE +; CHECK-NEXT: adrp x8, [[SET:l__MergedGlobals]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/global-merge-ignore-single-use.ll b/test/CodeGen/AArch64/global-merge-ignore-single-use.ll index e6de4699132ae..c3756a85feff5 100644 --- a/test/CodeGen/AArch64/global-merge-ignore-single-use.ll +++ b/test/CodeGen/AArch64/global-merge-ignore-single-use.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: f1: define void @f1(i32 %a1, i32 %a2) #0 { -; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE +; CHECK-NEXT: adrp x8, [[SET:l__MergedGlobals]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF ; CHECK-NEXT: stp w0, w1, [x8] ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll index b2c11c7517c0b..d2133213f1864 100644 --- a/test/CodeGen/AArch64/ldst-opt.ll +++ b/test/CodeGen/AArch64/ldst-opt.ll @@ -3,11 +3,15 @@ ; This file contains tests for the AArch64 load/store optimizer. 
%padding = type { i8*, i8*, i8*, i8* } +%s.byte = type { i8, i8 } +%s.halfword = type { i16, i16 } %s.word = type { i32, i32 } %s.doubleword = type { i64, i32 } %s.quadword = type { fp128, i32 } %s.float = type { float, i32 } %s.double = type { double, i32 } +%struct.byte = type { %padding, %s.byte } +%struct.halfword = type { %padding, %s.halfword } %struct.word = type { %padding, %s.word } %struct.doubleword = type { %padding, %s.doubleword } %struct.quadword = type { %padding, %s.quadword } @@ -24,6 +28,62 @@ ; ; with X being either w1, x1, s0, d0 or q0. +declare void @bar_byte(%s.byte*, i8) + +define void @load-pre-indexed-byte(%struct.byte* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-byte +; CHECK: ldrb w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1, i32 0 + %add = load i8, i8* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1 + tail call void @bar_byte(%s.byte* %c, i8 %add) + ret void +} + +define void @store-pre-indexed-byte(%struct.byte* %ptr, i8 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-byte +; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1, i32 0 + store i8 %val, i8* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1 + tail call void @bar_byte(%s.byte* %c, i8 %val) + ret void +} + +declare void @bar_halfword(%s.halfword*, i16) + +define void @load-pre-indexed-halfword(%struct.halfword* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-halfword +; CHECK: ldrh w{{[0-9]+}}, [x{{[0-9]+}}, #32]! 
+entry: + %a = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1, i32 0 + %add = load i16, i16* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1 + tail call void @bar_halfword(%s.halfword* %c, i16 %add) + ret void +} + +define void @store-pre-indexed-halfword(%struct.halfword* %ptr, i16 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-halfword +; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1, i32 0 + store i16 %val, i16* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1 + tail call void @bar_halfword(%s.halfword* %c, i16 %val) + ret void +} + declare void @bar_word(%s.word*, i32) define void @load-pre-indexed-word(%struct.word* %ptr) nounwind { @@ -164,6 +224,48 @@ bar: ret void } +; Check the following transform: +; +; (ldp|stp) w1, w2 [x0, #32] +; ... +; add x0, x0, #32 +; -> +; (ldp|stp) w1, w2, [x0, #32]! +; + +define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind { +; CHECK-LABEL: load-pair-pre-indexed-word +; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 + %a1 = load i32, i32* %a, align 4 + %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1 + %b1 = load i32, i32* %b, align 4 + %add = add i32 %a1, %b1 + br label %bar +bar: + %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %add) + ret void +} + +define void @store-pair-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind { +; CHECK-LABEL: store-pair-pre-indexed-word +; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! 
+; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 + store i32 %val, i32* %a, align 4 + %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1 + store i32 %val, i32* %b, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %val) + ret void +} + ; Check the following transform: ; ; add x8, x8, #16 @@ -174,11 +276,11 @@ bar: ; ; with X being either w0, x0, s0, d0 or q0. -%pre.struct.i32 = type { i32, i32, i32} -%pre.struct.i64 = type { i32, i64, i64} -%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>} -%pre.struct.float = type { i32, float, float} -%pre.struct.double = type { i32, double, double} +%pre.struct.i32 = type { i32, i32, i32, i32, i32} +%pre.struct.i64 = type { i32, i64, i64, i64, i64} +%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>, <2 x i64>} +%pre.struct.float = type { i32, float, float, float} +%pre.struct.double = type { i32, double, double, double} define i32 @load-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond, %pre.struct.i32* %load2) nounwind { @@ -270,6 +372,96 @@ return: ret double %ret } +define i32 @load-pre-indexed-word3(%pre.struct.i32** %this, i1 %cond, + %pre.struct.i32* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-word3 +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #12]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i32*, %pre.struct.i32** %this + %gep1 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load1, i64 0, i32 3 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load2, i64 0, i32 4 + br label %return +return: + %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load i32, i32* %retptr + ret i32 %ret +} + +define i64 @load-pre-indexed-doubleword3(%pre.struct.i64** %this, i1 %cond, + %pre.struct.i64* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-doubleword3 +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #16]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i64*, %pre.struct.i64** %this + %gep1 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load i64, i64* %retptr + ret i64 %ret +} + +define <2 x i64> @load-pre-indexed-quadword3(%pre.struct.i128** %this, i1 %cond, + %pre.struct.i128* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-quadword3 +; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i128*, %pre.struct.i128** %this + %gep1 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load <2 x i64>, <2 x i64>* %retptr + ret <2 x i64> %ret +} + +define float @load-pre-indexed-float3(%pre.struct.float** %this, i1 %cond, + %pre.struct.float* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-float3 +; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #8]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.float*, %pre.struct.float** %this + %gep1 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load float, float* %retptr + ret float %ret +} + +define double @load-pre-indexed-double3(%pre.struct.double** %this, i1 %cond, + %pre.struct.double* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-double3 +; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #16]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.double*, %pre.struct.double** %this + %gep1 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load double, double* %retptr + ret double %ret +} + ; Check the following transform: ; ; add x8, x8, #16 @@ -375,6 +567,101 @@ return: ret void } +define void @store-pre-indexed-word3(%pre.struct.i32** %this, i1 %cond, + %pre.struct.i32* %load2, + i32 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-word3 +; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #12]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i32*, %pre.struct.i32** %this + %gep1 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load1, i64 0, i32 3 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load2, i64 0, i32 4 + br label %return +return: + %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ] + store i32 %val, i32* %retptr + ret void +} + +define void @store-pre-indexed-doubleword3(%pre.struct.i64** %this, i1 %cond, + %pre.struct.i64* %load2, + i64 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-doubleword3 +; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #24]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i64*, %pre.struct.i64** %this + %gep1 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load1, i64 0, i32 3 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load2, i64 0, i32 4 + br label %return +return: + %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ] + store i64 %val, i64* %retptr + ret void +} + +define void @store-pre-indexed-quadword3(%pre.struct.i128** %this, i1 %cond, + %pre.struct.i128* %load2, + <2 x i64> %val) nounwind { +; CHECK-LABEL: store-pre-indexed-quadword3 +; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i128*, %pre.struct.i128** %this + %gep1 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ] + store <2 x i64> %val, <2 x i64>* %retptr + ret void +} + +define void @store-pre-indexed-float3(%pre.struct.float** %this, i1 %cond, + %pre.struct.float* %load2, + float %val) nounwind { +; CHECK-LABEL: store-pre-indexed-float3 +; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #8]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.float*, %pre.struct.float** %this + %gep1 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ] + store float %val, float* %retptr + ret void +} + +define void @store-pre-indexed-double3(%pre.struct.double** %this, i1 %cond, + %pre.struct.double* %load2, + double %val) nounwind { +; CHECK-LABEL: store-pre-indexed-double3 +; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #16]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.double*, %pre.struct.double** %this + %gep1 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load1, i64 0, i32 2 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load2, i64 0, i32 3 + br label %return +return: + %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ] + store double %val, double* %retptr + ret void +} + ; Check the following transform: ; ; ldr X, [x20] @@ -385,6 +672,54 @@ return: ; ; with X being either w0, x0, s0, d0 or q0. 
+define void @load-post-indexed-byte(i8* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-byte +; CHECK: ldrb w{{[0-9]+}}, [x{{[0-9]+}}], #4 +entry: + %gep1 = getelementptr i8, i8* %array, i64 2 + br label %body + +body: + %iv2 = phi i8* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i8, i8* %iv2, i64 -1 + %load = load i8, i8* %gep2 + call void @use-byte(i8 %load) + %load2 = load i8, i8* %iv2 + call void @use-byte(i8 %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i8, i8* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @load-post-indexed-halfword(i16* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-halfword +; CHECK: ldrh w{{[0-9]+}}, [x{{[0-9]+}}], #8 +entry: + %gep1 = getelementptr i16, i16* %array, i64 2 + br label %body + +body: + %iv2 = phi i16* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i16, i16* %iv2, i64 -1 + %load = load i16, i16* %gep2 + call void @use-halfword(i16 %load) + %load2 = load i16, i16* %iv2 + call void @use-halfword(i16 %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i16, i16* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + define void @load-post-indexed-word(i32* %array, i64 %count) nounwind { ; CHECK-LABEL: load-post-indexed-word ; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #16 @@ -515,6 +850,52 @@ exit: ; ; with X being either w0, x0, s0, d0 or q0. 
+define void @store-post-indexed-byte(i8* %array, i64 %count, i8 %val) nounwind { +; CHECK-LABEL: store-post-indexed-byte +; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}], #4 +entry: + %gep1 = getelementptr i8, i8* %array, i64 2 + br label %body + +body: + %iv2 = phi i8* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i8, i8* %iv2, i64 -1 + %load = load i8, i8* %gep2 + call void @use-byte(i8 %load) + store i8 %val, i8* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i8, i8* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @store-post-indexed-halfword(i16* %array, i64 %count, i16 %val) nounwind { +; CHECK-LABEL: store-post-indexed-halfword +; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}], #8 +entry: + %gep1 = getelementptr i16, i16* %array, i64 2 + br label %body + +body: + %iv2 = phi i16* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i16, i16* %iv2, i64 -1 + %load = load i16, i16* %gep2 + call void @use-halfword(i16 %load) + store i16 %val, i16* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i16, i16* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + define void @store-post-indexed-word(i32* %array, i64 %count, i32 %val) nounwind { ; CHECK-LABEL: store-post-indexed-word ; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #16 @@ -630,12 +1011,98 @@ exit: ret void } +declare void @use-byte(i8) +declare void @use-halfword(i16) declare void @use-word(i32) declare void @use-doubleword(i64) declare void @use-quadword(<2 x i64>) declare void @use-float(float) declare void @use-double(double) +; Check the following transform: +; +; stp w0, [x20] +; ... 
+; add x20, x20, #32 +; -> +; stp w0, [x20], #32 + +define void @store-pair-post-indexed-word() nounwind { +; CHECK-LABEL: store-pair-post-indexed-word +; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16 +; CHECK: ret + %src = alloca { i32, i32 }, align 8 + %dst = alloca { i32, i32 }, align 8 + + %src.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 0 + %src.real = load i32, i32* %src.realp + %src.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 1 + %src.imag = load i32, i32* %src.imagp + + %dst.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 1 + store i32 %src.real, i32* %dst.realp + store i32 %src.imag, i32* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-doubleword() nounwind { +; CHECK-LABEL: store-pair-post-indexed-doubleword +; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32 +; CHECK: ret + %src = alloca { i64, i64 }, align 8 + %dst = alloca { i64, i64 }, align 8 + + %src.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 0 + %src.real = load i64, i64* %src.realp + %src.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 1 + %src.imag = load i64, i64* %src.imagp + + %dst.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 1 + store i64 %src.real, i64* %dst.realp + store i64 %src.imag, i64* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-float() nounwind { +; CHECK-LABEL: store-pair-post-indexed-float +; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16 +; CHECK: ret + %src = alloca { float, float }, align 8 + %dst = alloca { float, float }, align 8 + + %src.realp = getelementptr inbounds { float, float }, { float, float }* %src, i32 0, i32 0 + %src.real = load float, float* %src.realp + %src.imagp = getelementptr 
inbounds { float, float }, { float, float }* %src, i32 0, i32 1 + %src.imag = load float, float* %src.imagp + + %dst.realp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 1 + store float %src.real, float* %dst.realp + store float %src.imag, float* %dst.imagp + ret void +} + +define void @store-pair-post-indexed-double() nounwind { +; CHECK-LABEL: store-pair-post-indexed-double +; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32 +; CHECK: ret + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 0 + %src.real = load double, double* %src.realp + %src.imagp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 1 + %src.imag = load double, double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} + ; Check the following transform: ; ; (ldr|str) X, [x20] diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll index 18dbad4ce25b1..86f5edd5da1d4 100644 --- a/test/CodeGen/AArch64/merge-store.ll +++ b/test/CodeGen/AArch64/merge-store.ll @@ -1,4 +1,5 @@ ; RUN: llc -march aarch64 %s -o - | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone | FileCheck %s --check-prefix=CYCLONE @g0 = external global <3 x float>, align 16 @g1 = external global <3 x float>, align 4 @@ -18,3 +19,32 @@ define void @blam() { store float %tmp9, float* %tmp7 ret void; } + + +; PR21711 - Merge vector stores into wider vector stores. 
+ +; On Cyclone, the stores should not get merged into a 16-byte store because +; unaligned 16-byte stores are slow. This test would infinite loop when +; the fastness of unaligned accesses was not specified correctly. + +define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) { + %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3 + %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4 + + %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> + %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> + + store <2 x float> %shuffle0, <2 x float>* %idx0, align 8 + store <2 x float> %shuffle1, <2 x float>* %idx1, align 8 + ret void + +; CHECK-LABEL: merge_vec_extract_stores +; CHECK: stur q0, [x0, #24] +; CHECK-NEXT: ret + +; CYCLONE-LABEL: merge_vec_extract_stores +; CYCLONE: ext v1.16b, v0.16b, v0.16b, #8 +; CYCLONE-NEXT: str d0, [x0, #24] +; CYCLONE-NEXT: str d1, [x0, #32] +; CYCLONE-NEXT: ret +} diff --git a/test/CodeGen/AArch64/misched-fusion.ll b/test/CodeGen/AArch64/misched-fusion.ll new file mode 100644 index 0000000000000..d38869329034f --- /dev/null +++ b/test/CodeGen/AArch64/misched-fusion.ll @@ -0,0 +1,34 @@ +; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s +target triple = "arm64-apple-ios" + +declare void @foobar(i32 %v0, i32 %v1) + +; Make sure sub is scheduled in front of cbnz +; CHECK-LABEL: test_sub_cbz: +; CHECK: add w[[ADDRES:[0-9]+]], w1, #7 +; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13 +; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]] +; CHECK: mov x0, x[[ADDRES]] +; CHECK: mov x1, x[[SUBRES]] +; CHECK: bl _foobar +; CHECK: [[SKIPBLOCK]]: +; CHECK: mov x0, x[[SUBRES]] +; CHECK: mov x1, x[[ADDRES]] +; CHECK: bl _foobar +define void @test_sub_cbz(i32 %a0, i32 %a1) { +entry: + ; except for the fusion opportunity the sub/add should be equal so the + ; scheduler would leave them in source order if it weren't for the scheduling + %v0 = sub i32 %a0, 13 + %cond = icmp 
eq i32 %v0, 0 + %v1 = add i32 %a1, 7 + br i1 %cond, label %if, label %exit + +if: + call void @foobar(i32 %v1, i32 %v0) + br label %exit + +exit: + call void @foobar(i32 %v0, i32 %v1) + ret void +} diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll index 4515697b99918..e93521858a312 100644 --- a/test/CodeGen/AArch64/mul-lohi.ll +++ b/test/CodeGen/AArch64/mul-lohi.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s ; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s + define i128 @test_128bitmul(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: test_128bitmul: ; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3 @@ -16,3 +17,31 @@ define i128 @test_128bitmul(i128 %lhs, i128 %rhs) { %prod = mul i128 %lhs, %rhs ret i128 %prod } + +; The machine combiner should create madd instructions when +; optimizing for size because that's smaller than mul + add. + +define i128 @test_128bitmul_optsize(i128 %lhs, i128 %rhs) optsize { +; CHECK-LABEL: test_128bitmul_optsize: +; CHECK: umulh [[HI:x[0-9]+]], x0, x2 +; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]] +; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]] +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: ret + + %prod = mul i128 %lhs, %rhs + ret i128 %prod +} + +define i128 @test_128bitmul_minsize(i128 %lhs, i128 %rhs) minsize { +; CHECK-LABEL: test_128bitmul_minsize: +; CHECK: umulh [[HI:x[0-9]+]], x0, x2 +; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]] +; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]] +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: ret + + %prod = mul i128 %lhs, %rhs + ret i128 %prod +} + diff --git a/test/CodeGen/AArch64/nest-register.ll b/test/CodeGen/AArch64/nest-register.ll index 9c659fb74ec44..cc42913e10a6c 100644 --- a/test/CodeGen/AArch64/nest-register.ll +++ b/test/CodeGen/AArch64/nest-register.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc 
-disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; Tests that the 'nest' parameter attribute causes the relevant parameter to be ; passed in the right register. diff --git a/test/CodeGen/AArch64/nontemporal.ll b/test/CodeGen/AArch64/nontemporal.ll new file mode 100644 index 0000000000000..db9779e031904 --- /dev/null +++ b/test/CodeGen/AArch64/nontemporal.ll @@ -0,0 +1,339 @@ +; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra | FileCheck %s + +define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 { +; CHECK-LABEL: test_stnp_v4i64: +; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1] +; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d1, d[[HI1]], [x0, #16] +; CHECK-NEXT: stnp d0, d[[HI0]], [x0] +; CHECK-NEXT: ret + store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 { +; CHECK-LABEL: test_stnp_v4i32: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 { +; CHECK-LABEL: test_stnp_v8i16: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { +; CHECK-LABEL: test_stnp_v16i8: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 { +; CHECK-LABEL: test_stnp_v2i32: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0 + ret void +} + +define void 
@test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 { +; CHECK-LABEL: test_stnp_v4i16: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 { +; CHECK-LABEL: test_stnp_v8i8: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v2f64: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0] +; CHECK-NEXT: ret + store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v1f64: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 { +; CHECK-LABEL: test_stnp_v1i64: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0] +; CHECK-NEXT: ret + store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_i64(i64* %p, i64 %v) #0 { +; CHECK-LABEL: 
test_stnp_i64: +; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32 +; CHECK-NEXT: stnp w1, w[[HI]], [x0] +; CHECK-NEXT: ret + store i64 %v, i64* %p, align 1, !nontemporal !0 + ret void +} + + +define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v2f64_offset: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #16] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1 + store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 { +; CHECK-LABEL: test_stnp_v2f64_offset_neg: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #-16] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1 + store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0, #8] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1 + store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset_neg: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0, #-8] +; CHECK-NEXT: ret + %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1 + store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 { +; CHECK-LABEL: test_stnp_i64_offset: +; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32 +; CHECK-NEXT: stnp w1, w[[HI]], [x0, #8] +; CHECK-NEXT: ret + %tmp0 = getelementptr i64, i64* %p, i32 1 + store i64 %v, i64* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void 
@test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 { +; CHECK-LABEL: test_stnp_i64_offset_neg: +; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32 +; CHECK-NEXT: stnp w1, w[[HI]], [x0, #-8] +; CHECK-NEXT: ret + %tmp0 = getelementptr i64, i64* %p, i32 -1 + store i64 %v, i64* %tmp0, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_4(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_4: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #4 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 4 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_neg_4(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_4: +; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #4 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -4 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_512(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_512: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #512 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 512 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_offset_504(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_504: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #504] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 504 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x 
float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_508(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_508: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #508 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 508 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_invalid_offset_neg_520(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_520: +; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #520 +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -520 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v4f32_offset_neg_512(i8* %p, <4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_neg_512: +; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp d0, d[[HI]], [x0, #-512] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -512 + %tmp1 = bitcast i8* %tmp0 to <4 x float>* + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + + +define void @test_stnp_v2f32_invalid_offset_256(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_invalid_offset_256: +; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #256 +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 256 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset_252(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset_252: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp 
s0, s[[HI]], [x0, #252] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 252 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_invalid_offset_neg_260(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_invalid_offset_neg_260: +; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #260 +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -260 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +define void @test_stnp_v2f32_offset_neg_256(i8* %p, <2 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v2f32_offset_neg_256: +; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1] +; CHECK-NEXT: stnp s0, s[[HI]], [x0, #-256] +; CHECK-NEXT: ret + %tmp0 = getelementptr i8, i8* %p, i32 -256 + %tmp1 = bitcast i8* %tmp0 to <2 x float>* + store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0 + ret void +} + +declare void @dummy(<4 x float>*) + +define void @test_stnp_v4f32_offset_alloca(<4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_alloca: +; CHECK: stnp d0, d{{.*}}, [sp] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: bl _dummy + %tmp0 = alloca <4 x float> + store <4 x float> %v, <4 x float>* %tmp0, align 1, !nontemporal !0 + call void @dummy(<4 x float>* %tmp0) + ret void +} + +define void @test_stnp_v4f32_offset_alloca_2(<4 x float> %v) #0 { +; CHECK-LABEL: test_stnp_v4f32_offset_alloca_2: +; CHECK: stnp d0, d{{.*}}, [sp, #16] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: bl _dummy + %tmp0 = alloca <4 x float>, i32 2 + %tmp1 = getelementptr <4 x float>, <4 x float>* %tmp0, i32 1 + store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0 + call void @dummy(<4 x float>* %tmp0) + ret void +} + +!0 = !{ i32 1 } + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll 
b/test/CodeGen/AArch64/pic-eh-stubs.ll index 143558f7b2c72..c59a5b6743d63 100644 --- a/test/CodeGen/AArch64/pic-eh-stubs.ll +++ b/test/CodeGen/AArch64/pic-eh-stubs.ll @@ -15,7 +15,7 @@ ; CHECK-NEXT: .xword .L_ZTIi.DW.stub-[[TYPEINFO_LBL]] ; .. and which is properly defined (in a writable section for the dynamic loader) later. -; CHECK: .section .data.rel,"aw" +; CHECK: .data ; CHECK: .L_ZTIi.DW.stub: ; CHECK-NEXT: .xword _ZTIi diff --git a/test/CodeGen/AArch64/readcyclecounter.ll b/test/CodeGen/AArch64/readcyclecounter.ll new file mode 100644 index 0000000000000..037f118093869 --- /dev/null +++ b/test/CodeGen/AArch64/readcyclecounter.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=aarch64-unknown-unknown -asm-verbose=false < %s |\ +; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=PERFMON +; RUN: llc -mtriple=aarch64-unknown-unknown -mattr=-perfmon -asm-verbose=false < %s |\ +; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=NOPERFMON + +define i64 @test_readcyclecounter() nounwind { + ; CHECK-LABEL: test_readcyclecounter: + ; PERFMON-NEXT: mrs x0, PMCCNTR_EL0 + ; NOPERFMON-NEXT: mov x0, xzr + ; CHECK-NEXT: ret + %tmp0 = call i64 @llvm.readcyclecounter() + ret i64 %tmp0 +} + +declare i64 @llvm.readcyclecounter() diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll index 0d301bbd502a3..ba34873eaa5b7 100644 --- a/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -27,8 +27,8 @@ define i64 @test_chains() { ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]] ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 -; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]] -; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]] +; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]] +; CHECK: and w0, w[[STRVAL]], #0xff %ret.1 = load i8, i8* %locvar %ret.2 = zext i8 %ret.1 to i64 diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll index 8b3e6dd5ad92a..a397c339a2d7b
100644 --- a/test/CodeGen/AArch64/remat.ll +++ b/test/CodeGen/AArch64/remat.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a35 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s diff --git a/test/CodeGen/AArch64/rotate.ll b/test/CodeGen/AArch64/rotate.ll new file mode 100644 index 0000000000000..5ac86d5f59c9d --- /dev/null +++ b/test/CodeGen/AArch64/rotate.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=aarch64--linux-gnueabihf | FileCheck %s + +;; This used to cause a backend crash about not being able to +;; select ROTL. Make sure it generates the basic ushr/shl. +define <2 x i64> @testcase(<2 x i64>* %in) { +; CHECK-LABEL: testcase +; CHECK: ushr {{v[0-9]+}}.2d +; CHECK: shl {{v[0-9]+}}.2d + %1 = load <2 x i64>, <2 x i64>* %in + %2 = lshr <2 x i64> %1, <i64 8, i64 8> + %3 = shl <2 x i64> %1, <i64 56, i64 56> + %4 = or <2 x i64> %2, %3 + ret <2 x i64> %4 +} diff --git a/test/CodeGen/AArch64/round-conv.ll b/test/CodeGen/AArch64/round-conv.ll new file mode 100644 index 0000000000000..5ed7d9409e3dd --- /dev/null +++ b/test/CodeGen/AArch64/round-conv.ll @@ -0,0 +1,330 @@ +; RUN: llc < %s -mtriple=arm64 | FileCheck %s + +; CHECK-LABEL: testmsws: +; CHECK: fcvtms w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testmsws(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmsxs: +; CHECK: fcvtms x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testmsxs(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testmswd: +; CHECK: fcvtms w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testmswd(double %a) { +entry: + %call = call double @floor(double %a)
nounwind readnone + %conv = fptosi double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmsxd: +; CHECK: fcvtms x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testmsxd(double %a) { +entry: + %call = call double @floor(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testmuws: +; CHECK: fcvtmu w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testmuws(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmuxs: +; CHECK: fcvtmu x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testmuxs(float %a) { +entry: + %call = call float @floorf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testmuwd: +; CHECK: fcvtmu w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testmuwd(double %a) { +entry: + %call = call double @floor(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testmuxd: +; CHECK: fcvtmu x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testmuxd(double %a) { +entry: + %call = call double @floor(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpsws: +; CHECK: fcvtps w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testpsws(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpsxs: +; CHECK: fcvtps x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testpsxs(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpswd: +; CHECK: fcvtps w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testpswd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptosi double 
%call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpsxd: +; CHECK: fcvtps x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testpsxd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpuws: +; CHECK: fcvtpu w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testpuws(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpuxs: +; CHECK: fcvtpu x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testpuxs(float %a) { +entry: + %call = call float @ceilf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testpuwd: +; CHECK: fcvtpu w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testpuwd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testpuxd: +; CHECK: fcvtpu x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testpuxd(double %a) { +entry: + %call = call double @ceil(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzsws: +; CHECK: fcvtzs w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testzsws(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testzsxs: +; CHECK: fcvtzs x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testzsxs(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzswd: +; CHECK: fcvtzs w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testzswd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptosi double %call to i32 + ret i32 %conv +} + +; 
CHECK-LABEL: testzsxd: +; CHECK: fcvtzs x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testzsxd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzuws: +; CHECK: fcvtzu w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testzuws(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testzuxs: +; CHECK: fcvtzu x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testzuxs(float %a) { +entry: + %call = call float @truncf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testzuwd: +; CHECK: fcvtzu w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testzuwd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testzuxd: +; CHECK: fcvtzu x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testzuxd(double %a) { +entry: + %call = call double @trunc(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testasws: +; CHECK: fcvtas w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testasws(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptosi float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testasxs: +; CHECK: fcvtas x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testasxs(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptosi float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testaswd: +; CHECK: fcvtas w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testaswd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptosi double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testasxd: +; CHECK: fcvtas x0, 
d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testasxd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptosi double %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testauws: +; CHECK: fcvtau w0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i32 @testauws(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptoui float %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testauxs: +; CHECK: fcvtau x0, s0 +; CHECK-NOT: frintx {{s[0-9]+}}, s0 +define i64 @testauxs(float %a) { +entry: + %call = call float @roundf(float %a) nounwind readnone + %conv = fptoui float %call to i64 + ret i64 %conv +} + +; CHECK-LABEL: testauwd: +; CHECK: fcvtau w0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i32 @testauwd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptoui double %call to i32 + ret i32 %conv +} + +; CHECK-LABEL: testauxd: +; CHECK: fcvtau x0, d0 +; CHECK-NOT: frintx {{d[0-9]+}}, d0 +define i64 @testauxd(double %a) { +entry: + %call = call double @round(double %a) nounwind readnone + %conv = fptoui double %call to i64 + ret i64 %conv +} + +declare float @floorf(float) nounwind readnone +declare double @floor(double) nounwind readnone +declare float @ceilf(float) nounwind readnone +declare double @ceil(double) nounwind readnone +declare float @truncf(float) nounwind readnone +declare double @trunc(double) nounwind readnone +declare float @roundf(float) nounwind readnone +declare double @round(double) nounwind readnone diff --git a/test/CodeGen/AArch64/shrink-wrap.ll b/test/CodeGen/AArch64/shrink-wrap.ll new file mode 100755 index 0000000000000..ea101a8da15d5 --- /dev/null +++ b/test/CodeGen/AArch64/shrink-wrap.ll @@ -0,0 +1,184 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s + +; Regression test for a crash in the ShrinkWrap pass not handling targets +; requiring a register scavenger. 
+ +%type1 = type { i32, i32, i32 } + +@g1 = external unnamed_addr global i32, align 4 +@g2 = external unnamed_addr global i1 +@g3 = external unnamed_addr global [144 x i32], align 4 +@g4 = external unnamed_addr constant [144 x i32], align 4 +@g5 = external unnamed_addr constant [144 x i32], align 4 +@g6 = external unnamed_addr constant [144 x i32], align 4 +@g7 = external unnamed_addr constant [144 x i32], align 4 +@g8 = external unnamed_addr constant [144 x i32], align 4 +@g9 = external unnamed_addr constant [144 x i32], align 4 +@g10 = external unnamed_addr constant [144 x i32], align 4 +@g11 = external unnamed_addr global i32, align 4 +@g12 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g13 = external unnamed_addr global %type1*, align 8 +@g14 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g15 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g16 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g17 = external unnamed_addr global [62 x i32], align 4 +@g18 = external unnamed_addr global i32, align 4 +@g19 = external unnamed_addr constant [144 x i32], align 4 +@g20 = external unnamed_addr global [144 x [144 x i8]], align 1 +@g21 = external unnamed_addr global i32, align 4 + +declare fastcc i32 @foo() + +declare fastcc i32 @bar() + +define internal fastcc i32 @func(i32 %alpha, i32 %beta) { +entry: + %v1 = alloca [2 x [11 x i32]], align 4 + %v2 = alloca [11 x i32], align 16 + %v3 = alloca [11 x i32], align 16 + switch i32 undef, label %if.end.9 [ + i32 4, label %if.then.6 + i32 3, label %if.then.2 + ] + +if.then.2: + %call3 = tail call fastcc i32 @bar() + br label %cleanup + +if.then.6: + %call7 = tail call fastcc i32 @foo() + unreachable + +if.end.9: + %tmp = load i32, i32* @g1, align 4 + %rem.i = urem i32 %tmp, 1000000 + %idxprom.1.i = zext i32 %rem.i to i64 + %tmp1 = load %type1*, %type1** @g13, align 8 + %v4 = getelementptr inbounds %type1, %type1* %tmp1, i64 %idxprom.1.i, i32 0 + %.b = load i1, i1* @g2, align 1 
+ %v5 = select i1 %.b, i32 2, i32 0 + %tmp2 = load i32, i32* @g18, align 4 + %tmp3 = load i32, i32* @g11, align 4 + %idxprom58 = sext i32 %tmp3 to i64 + %tmp4 = load i32, i32* @g21, align 4 + %idxprom69 = sext i32 %tmp4 to i64 + br label %for.body + +for.body: + %v6 = phi i32 [ 0, %if.end.9 ], [ %v7, %for.inc ] + %a.0983 = phi i32 [ 1, %if.end.9 ], [ %a.1, %for.inc ] + %arrayidx = getelementptr inbounds [62 x i32], [62 x i32]* @g17, i64 0, i64 undef + %tmp5 = load i32, i32* %arrayidx, align 4 + br i1 undef, label %for.inc, label %if.else.51 + +if.else.51: + %idxprom53 = sext i32 %tmp5 to i64 + %arrayidx54 = getelementptr inbounds [144 x i32], [144 x i32]* @g3, i64 0, i64 %idxprom53 + %tmp6 = load i32, i32* %arrayidx54, align 4 + switch i32 %tmp6, label %for.inc [ + i32 1, label %block.bb + i32 10, label %block.bb.159 + i32 7, label %block.bb.75 + i32 8, label %block.bb.87 + i32 9, label %block.bb.147 + i32 12, label %block.bb.111 + i32 3, label %block.bb.123 + i32 4, label %block.bb.135 + ] + +block.bb: + %arrayidx56 = getelementptr inbounds [144 x i32], [144 x i32]* @g6, i64 0, i64 %idxprom53 + %tmp7 = load i32, i32* %arrayidx56, align 4 + %shr = ashr i32 %tmp7, %v5 + %add57 = add nsw i32 %shr, 0 + %arrayidx61 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g14, i64 0, i64 %idxprom53, i64 %idxprom58 + %tmp8 = load i8, i8* %arrayidx61, align 1 + %conv = zext i8 %tmp8 to i32 + %add62 = add nsw i32 %conv, %add57 + br label %for.inc + +block.bb.75: + %arrayidx78 = getelementptr inbounds [144 x i32], [144 x i32]* @g10, i64 0, i64 %idxprom53 + %tmp9 = load i32, i32* %arrayidx78, align 4 + %shr79 = ashr i32 %tmp9, %v5 + %add80 = add nsw i32 %shr79, 0 + %add86 = add nsw i32 0, %add80 + br label %for.inc + +block.bb.87: + %arrayidx90 = getelementptr inbounds [144 x i32], [144 x i32]* @g9, i64 0, i64 %idxprom53 + %tmp10 = load i32, i32* %arrayidx90, align 4 + %shr91 = ashr i32 %tmp10, 0 + %sub92 = sub nsw i32 0, %shr91 + %arrayidx96 = getelementptr 
inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g15, i64 0, i64 %idxprom53, i64 %idxprom69 + %tmp11 = load i8, i8* %arrayidx96, align 1 + %conv97 = zext i8 %tmp11 to i32 + %sub98 = sub nsw i32 %sub92, %conv97 + br label %for.inc + +block.bb.111: + %arrayidx114 = getelementptr inbounds [144 x i32], [144 x i32]* @g19, i64 0, i64 %idxprom53 + %tmp12 = load i32, i32* %arrayidx114, align 4 + %shr115 = ashr i32 %tmp12, 0 + %sub116 = sub nsw i32 0, %shr115 + %arrayidx120 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g12, i64 0, i64 %idxprom53, i64 %idxprom69 + %tmp13 = load i8, i8* %arrayidx120, align 1 + %conv121 = zext i8 %tmp13 to i32 + %sub122 = sub nsw i32 %sub116, %conv121 + br label %for.inc + +block.bb.123: + %arrayidx126 = getelementptr inbounds [144 x i32], [144 x i32]* @g5, i64 0, i64 %idxprom53 + %tmp14 = load i32, i32* %arrayidx126, align 4 + %shr127 = ashr i32 %tmp14, %v5 + %add128 = add nsw i32 %shr127, 0 + %add134 = add nsw i32 0, %add128 + br label %for.inc + +block.bb.135: + %arrayidx138 = getelementptr inbounds [144 x i32], [144 x i32]* @g4, i64 0, i64 %idxprom53 + %tmp15 = load i32, i32* %arrayidx138, align 4 + %shr139 = ashr i32 %tmp15, 0 + %sub140 = sub nsw i32 0, %shr139 + %arrayidx144 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g20, i64 0, i64 %idxprom53, i64 %idxprom69 + %tmp16 = load i8, i8* %arrayidx144, align 1 + %conv145 = zext i8 %tmp16 to i32 + %sub146 = sub nsw i32 %sub140, %conv145 + br label %for.inc + +block.bb.147: + %arrayidx150 = getelementptr inbounds [144 x i32], [144 x i32]* @g8, i64 0, i64 %idxprom53 + %tmp17 = load i32, i32* %arrayidx150, align 4 + %shr151 = ashr i32 %tmp17, %v5 + %add152 = add nsw i32 %shr151, 0 + %arrayidx156 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g16, i64 0, i64 %idxprom53, i64 %idxprom58 + %tmp18 = load i8, i8* %arrayidx156, align 1 + %conv157 = zext i8 %tmp18 to i32 + %add158 = add nsw i32 %conv157, %add152 + br label %for.inc + 
+block.bb.159: + %sub160 = add nsw i32 %v6, -450 + %arrayidx162 = getelementptr inbounds [144 x i32], [144 x i32]* @g7, i64 0, i64 %idxprom53 + %tmp19 = load i32, i32* %arrayidx162, align 4 + %shr163 = ashr i32 %tmp19, 0 + %sub164 = sub nsw i32 %sub160, %shr163 + %sub170 = sub nsw i32 %sub164, 0 + br label %for.inc + +for.inc: + %v7 = phi i32 [ %v6, %for.body ], [ %v6, %if.else.51 ], [ %sub170, %block.bb.159 ], [ %add158, %block.bb.147 ], [ %sub146, %block.bb.135 ], [ %add134, %block.bb.123 ], [ %sub122, %block.bb.111 ], [ %sub98, %block.bb.87 ], [ %add86, %block.bb.75 ], [ %add62, %block.bb ] + %a.1 = phi i32 [ %a.0983, %for.body ], [ undef, %if.else.51 ], [ undef, %block.bb.159 ], [ undef, %block.bb.147 ], [ undef, %block.bb.135 ], [ undef, %block.bb.123 ], [ undef, %block.bb.111 ], [ undef, %block.bb.87 ], [ undef, %block.bb.75 ], [ undef, %block.bb ] + %cmp48 = icmp sgt i32 %a.1, %tmp2 + br i1 %cmp48, label %for.end, label %for.body + +for.end: + store i32 %tmp, i32* %v4, align 4 + %hold_hash.i.7 = getelementptr inbounds %type1, %type1* %tmp1, i64 %idxprom.1.i, i32 1 + store i32 0, i32* %hold_hash.i.7, align 4 + br label %cleanup + +cleanup: + %retval.0 = phi i32 [ %call3, %if.then.2 ], [ undef, %for.end ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll new file mode 100644 index 0000000000000..4712012b0d25d --- /dev/null +++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll @@ -0,0 +1,20 @@ +; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL +; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL + +define void @caller_meta_leaf() { +entry: + %metadata = alloca i64, i32 3, align 8 + store i64 11, i64* %metadata + store i64 12, i64* %metadata + store i64 13, i64* %metadata +; ISEL: 
ADJCALLSTACKDOWN 0, implicit-def +; ISEL-NEXT: STACKMAP +; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def + call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) +; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def +; FAST-ISEL-NEXT: STACKMAP +; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def + ret void +} + +declare void @llvm.experimental.stackmap(i64, i32, ...) diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll index e5766154bb46f..fa5d8b943b6b5 100644 --- a/test/CodeGen/AArch64/tail-call.ll +++ b/test/CodeGen/AArch64/tail-call.ll @@ -59,8 +59,7 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) { ; callee will not deallocate the space, even in fastcc. tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2) -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack16 ret void } @@ -89,8 +88,7 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { ret void ; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack16 } diff --git a/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/test/CodeGen/AArch64/tailcall-explicit-sret.ll index 4d80f2ac5c121..bcc8af8d0690f 100644 --- a/test/CodeGen/AArch64/tailcall-explicit-sret.ll +++ b/test/CodeGen/AArch64/tailcall-explicit-sret.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false -disable-post-ra | FileCheck %s ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks. 
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/AArch64/tbi.ll b/test/CodeGen/AArch64/tbi.ll new file mode 100644 index 0000000000000..ab2d31b7cacc0 --- /dev/null +++ b/test/CodeGen/AArch64/tbi.ll @@ -0,0 +1,102 @@ +; RUN: llc -aarch64-use-tbi -mtriple=arm64-apple-ios8.0.0 < %s \ +; RUN: | FileCheck --check-prefix=TBI --check-prefix=BOTH %s +; RUN: llc -aarch64-use-tbi -mtriple=arm64-apple-ios7.1.0 < %s \ +; RUN: | FileCheck --check-prefix=NO_TBI --check-prefix=BOTH %s + +; BOTH-LABEL:ld_and32: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_and32(i64 %p) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; load (r & MASK) + 4 +; BOTH-LABEL:ld_and_plus_offset: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_and_plus_offset(i64 %p) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + %gep = getelementptr i32, i32* %cast, i64 4 + %load = load i32, i32* %gep + ret i32 %load +} + +; load (r & WIDER_MASK) +; BOTH-LABEL:ld_and32_wider: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_and32_wider(i64 %p) { + %and = and i64 %p, 1152921504606846975 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; BOTH-LABEL:ld_and64: +; TBI-NOT: and x +; NO_TBI: and x +define i64 @ld_and64(i64 %p) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i64* + %load = load i64, i64* %cast + ret i64 %load +} + +; BOTH-LABEL:st_and32: +; TBI-NOT: and x +; NO_TBI: and x +define void @st_and32(i64 %p, i32 %v) { + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + store i32 %v, i32* %cast + ret void +} + +; load (x1 + x2) & MASK +; BOTH-LABEL:ld_ro: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_ro(i64 %a, i64 %b) { + %p = add i64 %a, %b + %and = and i64 %p, 72057594037927935 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; 
load (r1 & MASK) + r2 +; BOTH-LABEL:ld_ro2: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_ro2(i64 %a, i64 %b) { + %and = and i64 %a, 72057594037927935 + %p = add i64 %and, %b + %cast = inttoptr i64 %p to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; load (r1 & MASK) | r2 +; BOTH-LABEL:ld_indirect_and: +; TBI-NOT: and x +; NO_TBI: and x +define i32 @ld_indirect_and(i64 %r1, i64 %r2) { + %and = and i64 %r1, 72057594037927935 + %p = or i64 %and, %r2 + %cast = inttoptr i64 %p to i32* + %load = load i32, i32* %cast + ret i32 %load +} + +; BOTH-LABEL:ld_and32_narrower: +; BOTH: and x +define i32 @ld_and32_narrower(i64 %p) { + %and = and i64 %p, 36028797018963967 + %cast = inttoptr i64 %and to i32* + %load = load i32, i32* %cast + ret i32 %load +} diff --git a/test/CodeGen/AArch64/vector-fcopysign.ll b/test/CodeGen/AArch64/vector-fcopysign.ll new file mode 100644 index 0000000000000..865a0a5b85808 --- /dev/null +++ b/test/CodeGen/AArch64/vector-fcopysign.ll @@ -0,0 +1,178 @@ +; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +;============ v1f32 + +; WidenVecRes same +define <1 x float> @test_copysign_v1f32_v1f32(<1 x float> %a, <1 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v1f32_v1f32: +; CHECK-NEXT: movi.2s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b) + ret <1 x float> %r +} + +; WidenVecRes mismatched +define <1 x float> @test_copysign_v1f32_v1f64(<1 x float> %a, <1 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v1f32_v1f64: +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fptrunc <1 x double> %b to <1 x float> + %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %tmp0) + ret <1 x float> %r +} + +declare <1 x float> 
@llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b) #0 + +;============ v1f64 + +; WidenVecOp #1 +define <1 x double> @test_copysign_v1f64_v1f32(<1 x double> %a, <1 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v1f64_v1f32: +; CHECK-NEXT: fcvt d1, s1 +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fpext <1 x float> %b to <1 x double> + %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %tmp0) + ret <1 x double> %r +} + +define <1 x double> @test_copysign_v1f64_v1f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v1f64_v1f64: +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %r +} + +declare <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) #0 + +;============ v2f32 + +define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v2f32_v2f32: +; CHECK-NEXT: movi.2s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v2f32_v2f64: +; CHECK-NEXT: fcvtn v1.2s, v1.2d +; CHECK-NEXT: movi.2s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.8b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fptrunc <2 x double> %b to <2 x float> + %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %tmp0) + ret <2 x float> %r +} + +declare <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) #0 + +;============ v4f32 + +define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v4f32_v4f32: +; CHECK-NEXT: movi.4s v2, 
#0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %r +} + +; SplitVecOp #1 +define <4 x float> @test_copysign_v4f32_v4f64(<4 x float> %a, <4 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v4f32_v4f64: +; CHECK-NEXT: mov s3, v0[1] +; CHECK-NEXT: mov d4, v1[1] +; CHECK-NEXT: movi.4s v5, #0x80, lsl #24 +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: mov s6, v0[2] +; CHECK-NEXT: mov s7, v0[3] +; CHECK-NEXT: fcvt s16, d2 +; CHECK-NEXT: bit.16b v0, v1, v5 +; CHECK-NEXT: bit.16b v6, v16, v5 +; CHECK-NEXT: fcvt s1, d4 +; CHECK-NEXT: bit.16b v3, v1, v5 +; CHECK-NEXT: mov d1, v2[1] +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: ins.s v0[1], v3[0] +; CHECK-NEXT: ins.s v0[2], v6[0] +; CHECK-NEXT: bit.16b v7, v1, v5 +; CHECK-NEXT: ins.s v0[3], v7[0] +; CHECK-NEXT: ret + %tmp0 = fptrunc <4 x double> %b to <4 x float> + %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0) + ret <4 x float> %r +} + +declare <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) #0 + +;============ v2f64 + +define <2 x double> @test_copysign_v2f64_v232(<2 x double> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v2f64_v232: +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: fcvtl v1.2d, v1.2s +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %tmp0 = fpext <2 x float> %b to <2 x double> + %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %tmp0) + ret <2 x double> %r +} + +define <2 x double> @test_copysign_v2f64_v2f64(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v2f64_v2f64: +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: fneg.2d v2, v2 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: ret + %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %r +} + +declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, 
<2 x double> %b) #0 + +;============ v4f64 + +; SplitVecRes mismatched +define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_copysign_v4f64_v4f32: +; CHECK-NEXT: movi.2d v3, #0000000000000000 +; CHECK-NEXT: fcvtl2 v4.2d, v2.4s +; CHECK-NEXT: fcvtl v2.2d, v2.2s +; CHECK-NEXT: fneg.2d v3, v3 +; CHECK-NEXT: bit.16b v1, v4, v3 +; CHECK-NEXT: bit.16b v0, v2, v3 +; CHECK-NEXT: ret + %tmp0 = fpext <4 x float> %b to <4 x double> + %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0) + ret <4 x double> %r +} + +; SplitVecRes same +define <4 x double> @test_copysign_v4f64_v4f64(<4 x double> %a, <4 x double> %b) #0 { +; CHECK-LABEL: test_copysign_v4f64_v4f64: +; CHECK-NEXT: movi.2d v4, #0000000000000000 +; CHECK-NEXT: fneg.2d v4, v4 +; CHECK-NEXT: bit.16b v0, v2, v4 +; CHECK-NEXT: bit.16b v1, v3, v4 +; CHECK-NEXT: ret + %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) + ret <4 x double> %r +} + +declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/xbfiz.ll b/test/CodeGen/AArch64/xbfiz.ll index f763400d7f6a7..3211cc3f2cedb 100644 --- a/test/CodeGen/AArch64/xbfiz.ll +++ b/test/CodeGen/AArch64/xbfiz.ll @@ -31,3 +31,33 @@ define i32 @ubfiz32(i32 %v) { %shr = lshr i32 %shl, 2 ret i32 %shr } + +define i64 @ubfiz64and(i64 %v) { +; CHECK-LABEL: ubfiz64and: +; CHECK: ubfiz x0, x0, #36, #11 + %shl = shl i64 %v, 36 + %and = and i64 %shl, 140668768878592 + ret i64 %and +} + +define i32 @ubfiz32and(i32 %v) { +; CHECK-LABEL: ubfiz32and: +; CHECK: ubfiz w0, w0, #6, #24 + %shl = shl i32 %v, 6 + %and = and i32 %shl, 1073741760 + ret i32 %and +} + +; Check that we don't generate a ubfiz if the lsl has more than one +; use, since we'd just be replacing an and with a ubfiz. 
+define i32 @noubfiz32(i32 %v) { +; CHECK-LABEL: noubfiz32: +; CHECK: lsl w[[REG1:[0-9]+]], w0, #6 +; CHECK: and w[[REG2:[0-9]+]], w[[REG1]], #0x3fffffc0 +; CHECK: add w0, w[[REG1]], w[[REG2]] +; CHECK: ret + %shl = shl i32 %v, 6 + %and = and i32 %shl, 1073741760 + %add = add i32 %shl, %and + ret i32 %add +} diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll index 655e75dbc1a48..2ddfa9649ac94 100644 --- a/test/CodeGen/AMDGPU/add.ll +++ b/test/CodeGen/AMDGPU/add.ll @@ -5,7 +5,7 @@ ;FUNC-LABEL: {{^}}test1: ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}} ;SI-NOT: [[REG]] ;SI: buffer_store_dword [[REG]], define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -21,8 +21,8 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -39,10 +39,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, 
v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll index 4be8c5847529c..3aa2f653bf9c4 100644 --- a/test/CodeGen/AMDGPU/address-space.ll +++ b/test/CodeGen/AMDGPU/address-space.ll @@ -5,15 +5,11 @@ %struct.foo = type { [3 x float], [3 x float] } -; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is -; already in a VGPR after the first read. - ; CHECK-LABEL: {{^}}do_as_ptr_calcs: ; CHECK: s_load_dword [[SREG1:s[0-9]+]], -; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll new file mode 100644 index 0000000000000..61bcd4b3c093d --- /dev/null +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -0,0 +1,66 @@ +; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: unsupported addrspacecast not implemented + +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK 
-check-prefix=CHECK-NO-PROMOTE %s +; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. + +; CHECK-LABEL: {{^}}branch_use_flat_i32: +; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: s_endpgm +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32, i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; TODO: This should not be zero when registers are used for small +; scratch allocations again. + +; Check for prologue initializing special SGPRs pointing to scratch. 
+; CHECK-LABEL: {{^}}store_flat_scratch: +; CHECK: s_movk_i32 flat_scratch_lo, 0 +; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} +; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} +; CHECK: flat_store_dword +; CHECK: s_barrier +; CHECK: flat_load_dword +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { + %alloca = alloca i32, i32 9, align 4 + %x = call i32 @llvm.r600.read.tidig.x() #3 + %pptr = getelementptr i32, i32* %alloca, i32 %x + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr + ; Dummy call + call void @llvm.AMDGPU.barrier.local() #1 + %reload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %reload, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 +declare i32 @llvm.r600.read.tidig.x() #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind convergent } +attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll index 5672d470bd7e0..f83fb16101fb6 100644 --- a/test/CodeGen/AMDGPU/and.ll +++ b/test/CodeGen/AMDGPU/and.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +declare i32 @llvm.r600.read.tidig.x() #0 + ; FUNC-LABEL: {{^}}test2: ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -54,13 +56,80 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { ret void } -; FUNC-LABEL: {{^}}v_and_i32: -; SI: v_and_b32 -define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +; FIXME: We should really duplicate the constant so that the SALU use +; can fold into the s_and_b32 and the VALU one is materialized +; directly without copying from the SGPR. + +; Second use is a VGPR use of the constant. +; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0: +; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 +; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; SI: buffer_store_dword [[VK]] +define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %and = and i32 %a, 1234567 + + ; Just to stop future replacement of copy to vgpr + store with VALU op. + %foo = add i32 %and, %b + store volatile i32 %foo, i32 addrspace(1)* %out + store volatile i32 1234567, i32 addrspace(1)* %out + ret void +} + +; Second use is another SGPR use of the constant. 
+; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1: +; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_add_i32 +; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: buffer_store_dword [[VK]] +define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %and = and i32 %a, 1234567 + %foo = add i32 %and, 1234567 + %bar = add i32 %foo, %b + store volatile i32 %bar, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr: +; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep.a + %b = load i32, i32 addrspace(1)* %gep.b %and = and i32 %a, %b - store i32 %and, i32 addrspace(1)* %out, align 4 + store i32 %and, i32 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32_sgpr_vgpr: +; SI-DAG: s_load_dword [[SA:s[0-9]+]] +; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] +; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] +define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.b + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32_vgpr_sgpr: +; SI-DAG: s_load_dword [[SA:s[0-9]+]] +; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] +; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] +define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, 
i32 %b) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep.a + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %gep.out ret void } @@ -148,9 +217,23 @@ endif: } ; FUNC-LABEL: {{^}}v_and_constant_i64: -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207 +; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], {{v[0-9]+}} +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], {{v[0-9]+}} +; SI: buffer_store_dwordx2 define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 1231231234567 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Should replace and 0 +; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant: +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} +define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -294,3 +377,5 @@ define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 a store i64 %and, i64 addrspace(1)* %out, align 8 ret void } + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/test/CodeGen/AMDGPU/annotate-kernel-features.ll new file mode 100644 index 0000000000000..b116c72322bb9 --- /dev/null +++ b/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -0,0 +1,193 @@ +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA -check-prefix=ALL %s +; RUN: opt -S 
-amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s + +declare i32 @llvm.r600.read.tgid.x() #0 +declare i32 @llvm.r600.read.tgid.y() #0 +declare i32 @llvm.r600.read.tgid.z() #0 + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tidig.y() #0 +declare i32 @llvm.r600.read.tidig.z() #0 + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +declare i32 @llvm.r600.read.global.size.x() #0 +declare i32 @llvm.r600.read.global.size.y() #0 +declare i32 @llvm.r600.read.global.size.z() #0 + + +; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tgid.x() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tgid.y() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val0, i32 addrspace(1)* %ptr + %val1 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.x() + %val1 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { +define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tgid.z() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) 
#3 { +define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.x() + %val1 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.y() + %val1 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tgid.x() + %val1 = call i32 @llvm.r600.read.tgid.y() + %val2 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + store volatile i32 %val2, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tidig.x() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tidig.y() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.tidig.z() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.x() + %val1 = call i32 @llvm.r600.read.tgid.x() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret 
void +} + +; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.y() + %val1 = call i32 @llvm.r600.read.tgid.y() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.x() + %val1 = call i32 @llvm.r600.read.tidig.y() + %val2 = call i32 @llvm.r600.read.tidig.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + store volatile i32 %val2, i32 addrspace(1)* %ptr + ret void +} + +; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { + %val0 = call i32 @llvm.r600.read.tidig.x() + %val1 = call i32 @llvm.r600.read.tidig.y() + %val2 = call i32 @llvm.r600.read.tidig.z() + %val3 = call i32 @llvm.r600.read.tgid.x() + %val4 = call i32 @llvm.r600.read.tgid.y() + %val5 = call i32 @llvm.r600.read.tgid.z() + store volatile i32 %val0, i32 addrspace(1)* %ptr + store volatile i32 %val1, i32 addrspace(1)* %ptr + store volatile i32 %val2, i32 addrspace(1)* %ptr + store volatile i32 %val3, i32 addrspace(1)* %ptr + store volatile i32 %val4, i32 addrspace(1)* %ptr + store volatile i32 %val5, i32 addrspace(1)* %ptr + ret void +} + +; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.local.size.x() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +define void @use_get_local_size_y(i32 
addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.local.size.y() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { + %val = call i32 @llvm.r600.read.local.size.z() + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +; HSA: attributes #0 = { nounwind readnone } +; HSA: attributes #1 = { nounwind } +; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" } +; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" } +; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" } +; HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" } +; HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" } +; HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" } +; HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" } diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index 8c2a0795860d8..f8a74222d5669 100644 --- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -2,7 +2,7 @@ ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s declare i32 @llvm.SI.tid() nounwind readnone -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate +declare void @llvm.AMDGPU.barrier.local() nounwind convergent ; The required pointer calculations for the alloca'd actually requires ; an add and won't be folded into the addressing, which fails 
with a @@ -14,7 +14,7 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate ; FIXME: We end up with zero argument for ADD, because ; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index ; with the appropriate offset. We should fold this into the store. -; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} +; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}} ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}] ; ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this @@ -22,7 +22,7 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate ; to interpret: ; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b -; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16 +; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16 ; SI-PROMOTE: ds_write_b32 [[PTRREG]] define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %alloca = alloca [4 x i32], i32 4, align 16 @@ -35,7 +35,7 @@ define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 add %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b store i32 %result, i32* %alloca_ptr, align 4 ; Dummy call - call void @llvm.AMDGPU.barrier.local() nounwind noduplicate + call void @llvm.AMDGPU.barrier.local() nounwind convergent %reload = load i32, i32* %alloca_ptr, align 4 %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll new file mode 100644 index 0000000000000..0ef7d5184c1f6 --- /dev/null +++ b/test/CodeGen/AMDGPU/bitreverse.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI 
-check-prefix=FUNC %s + +declare i16 @llvm.bitreverse.i16(i16) #1 +declare i32 @llvm.bitreverse.i32(i32) #1 +declare i64 @llvm.bitreverse.i64(i64) #1 + +declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1 +declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1 + +declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1 +declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 + +declare i32 @llvm.AMDGPU.brev(i32) #1 + +; FUNC-LABEL: {{^}}s_brev_i16: +; SI: s_brev_b32 +define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { + %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 + store i16 %brev, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i16: +; SI: v_bfrev_b32_e32 +define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { + %val = load i16, i16 addrspace(1)* %valptr + %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 + store i16 %brev, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { + %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { + %val = load i32, i32 addrspace(1)* %valptr + %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_v2i32: +; SI: s_brev_b32 +; SI: s_brev_b32 +define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { + %brev = call <2 x i32> 
@llvm.bitreverse.v2i32(<2 x i32> %val) #1 + store <2 x i32> %brev, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_v2i32: +; SI: v_bfrev_b32_e32 +; SI: v_bfrev_b32_e32 +define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr + %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 + store <2 x i32> %brev, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_i64: +define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { + %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 + store i64 %brev, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i64: +define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { + %val = load i64, i64 addrspace(1)* %valptr + %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 + store i64 %brev, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_brev_v2i64: +define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { + %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 + store <2 x i64> %brev, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_brev_v2i64: +define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr + %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 + store <2 x i64> %brev, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}legacy_s_brev_i32: +; SI: s_brev_b32 +define void @legacy_s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %brev = call i32 @llvm.AMDGPU.brev(i32 %val) #1 + store i32 %brev, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/calling-conventions.ll 
b/test/CodeGen/AMDGPU/calling-conventions.ll new file mode 100644 index 0000000000000..57adc8be6a997 --- /dev/null +++ b/test/CodeGen/AMDGPU/calling-conventions.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; Make sure we don't crash or assert on spir_kernel calling convention. + +; SI-LABEL: {{^}}kernel: +; SI: s_endpgm +define spir_kernel void @kernel(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; FIXME: This is treated like a kernel +; SI-LABEL: {{^}}func: +; SI: s_endpgm +define spir_func void @func(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll new file mode 100644 index 0000000000000..1c5bed3b905f5 --- /dev/null +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -0,0 +1,98 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; OPT-LABEL: @test_no_sink_flat_small_offset_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %in +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: +; GCN: flat_load_dword +; GCN: {{^}}BB0_2: +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, 
label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(4)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: +; CI: buffer_load_dword {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 +define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(1)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %out, +; OPT-CI-NOT: getelementptr +; OPT: br i1 + +; OPT-CI: ptrtoint +; OPT-CI: add +; OPT-CI: inttoptr +; OPT: br label + +; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: +; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(2)* + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 
= load i32, i32 addrspace(2)* %cast + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index a68d110fdc96d..698494265a7d4 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,5 +1,7 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s @@ -115,35 +117,6 @@ done: ret void } -; OPT-LABEL: @test_no_sink_flat_small_offset_i32( -; OPT: getelementptr i32, i32 addrspace(4)* %in -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: -; GCN: flat_load_dword -; GCN: {{^}}BB4_2: - -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(4)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep - br label %done - -done: - ret 
void -} - ; OPT-LABEL: @test_sink_scratch_small_offset_i32( ; OPT-NOT: getelementptr [512 x i32] ; OPT: br i1 @@ -153,7 +126,7 @@ done: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: {{^}}BB5_2: +; GCN: {{^}}BB4_2: define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 @@ -189,7 +162,7 @@ done: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: {{^}}BB6_2: +; GCN: {{^}}BB5_2: define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 @@ -222,7 +195,7 @@ done: ; GCN: s_and_saveexec_b64 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GCN: {{^}}BB7_2: +; GCN: {{^}}BB6_2: define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { entry: %offset.ext = zext i32 %offset to i64 @@ -246,3 +219,220 @@ done: attributes #0 = { nounwind readnone } attributes #1 = { nounwind } + + + +; OPT-LABEL: @test_sink_constant_small_offset_i32 +; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32: +; GCN: s_and_saveexec_b64 +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 
addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32 +; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32: +; GCN: s_and_saveexec_b64 +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32 +; OPT-SI: getelementptr i32, i32 addrspace(2)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32: +; GCN: s_and_saveexec_b64 +; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400 + +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: 
+ %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32 +; OPT-SI: getelementptr i32, i32 addrspace(2)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}} +; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}} +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32 +; OPT: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32: +; GCN: s_and_saveexec_b64 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, 
%entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32: +; GCN: s_and_saveexec_b64 +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}} +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} + +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}} +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} + +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32 +; OPT-SI: getelementptr i32, i32 addrspace(2)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-VI: getelementptr i32, i32 addrspace(2)* +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32: +; GCN: s_and_saveexec_b64 +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}} +; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} + +; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}} + +; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}} +; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} + +; GCN: s_or_b64 exec, exec +define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 + %tmp0 = icmp eq i32 %cond, 0 + br 
i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(2)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} diff --git a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll new file mode 100644 index 0000000000000..1a37e3c75fa38 --- /dev/null +++ b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s + + +; HSA-DEFAULT: flat_store_dword +; HSA-NODEFAULT: buffer_store_dword +; NOHSA-DEFAULT: buffer_store_dword +; NOHSA-NODEFAULT: flat_store_dword +define void @test(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index e1a0ee3ea2175..ec2971e98032a 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -36,6 +36,24 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali ret void } +; FIXME: or 0 should be replaced with copy +; FUNC-LABEL: {{^}}v_ctpop_i64_user: +; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] +; GCN-DAG: v_or_b32_e64 v[[RESULT_HI:[0-9]+]], 0, s{{[0-9]+}} +; GCN: 
buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GCN: s_endpgm +define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { + %val = load i64, i64 addrspace(1)* %in, align 8 + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone + %or = or i64 %ctpop, %s.val + store i64 %or, i64 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}s_ctpop_v2i64: ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 @@ -99,8 +117,8 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs ; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd ; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 ; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} -; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] -; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] +; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} ; GCN: s_endpgm define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 3399d9da29e3d..834922c62cbd9 100644 --- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -137,14 +137,8 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] ; SI-NOT: bfe ; SI-NOT: lshr -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias 
%in) nounwind { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> @@ -154,7 +148,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> ; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: ; SI: buffer_load_dword [[LOADREG:v[0-9]+]], -; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] +; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] ; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] ; SI: buffer_store_dword [[CONV]], define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll new file mode 100644 index 0000000000000..171883e4c74b2 --- /dev/null +++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -0,0 +1,52 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + +; The memory operand was dropped from the buffer_load_dword_offset +; when replaced with the addr64 during operand legalization, resulting +; in the global loads not being scheduled together. 
+ +; GCN-LABEL: {{^}}reschedule_global_load_lds_store: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: s_endpgm +define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { +entry: + %tid = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx = shl i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %gptr0, i32 %idx + %gep1 = getelementptr i32, i32 addrspace(1)* %gptr1, i32 %idx + %gep2 = getelementptr i32, i32 addrspace(3)* %lptr, i32 %tid + %cmp0 = icmp eq i32 %c, 0 + br i1 %cmp0, label %for.body, label %exit + +for.body: ; preds = %for.body, %entry + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.body ] + %gptr0.phi = phi i32 addrspace(1)* [ %gep0, %entry ], [ %gep0.inc, %for.body ] + %gptr1.phi = phi i32 addrspace(1)* [ %gep1, %entry ], [ %gep1.inc, %for.body ] + %lptr0.phi = phi i32 addrspace(3)* [ %gep2, %entry ], [ %gep2.inc, %for.body ] + %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 1 + %val0 = load i32, i32 addrspace(1)* %gep0 + store i32 %val0, i32 addrspace(3)* %lptr0.phi + %val1 = load i32, i32 addrspace(1)* %gep1 + store i32 %val1, i32 addrspace(3)* %lptr1 + %gep0.inc = getelementptr i32, i32 addrspace(1)* %gptr0.phi, i32 4 + %gep1.inc = getelementptr i32, i32 addrspace(1)* %gptr1.phi, i32 4 + %gep2.inc = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 4 + %i.inc = add nsw i32 %i, 1 + %cmp1 = icmp ne i32 %i, 256 + br i1 %cmp1, label %for.body, label %exit + +exit: ; preds = %for.body, %entry + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll 
b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll index 5e4654abd91bd..e657991557e3a 100644 --- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -10,13 +10,13 @@ declare void @llvm.AMDGPU.barrier.local() #1 ; CHECK: BB0_1: ; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] -; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], vcc, 4, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] -; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] -; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], vcc, 0x84, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] -; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 @@ -66,5 +66,5 @@ for.end: ; preds = %for.body } attributes #0 = { nounwind readnone } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll new file mode 100644 index 0000000000000..7d6eddb01993c --- /dev/null +++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -0,0 +1,125 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + +declare void 
@llvm.AMDGPU.barrier.local() #2 +declare i32 @llvm.r600.read.tidig.x() #0 + +@lds.obj = addrspace(3) global [256 x i32] undef, align 4 + +; GCN-LABEL: {{^}}write_ds_sub0_offset0_global: +; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0 +; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] +; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b +; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 +define void @write_ds_sub0_offset0_global() #0 { +entry: + %x.i = call i32 @llvm.r600.read.tidig.x() #1 + %sub1 = sub i32 0, %x.i + %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3 + store i32 123, i32 addrspace(3)* %arrayidx + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_max_offset: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535 +define void @add_x_shl_neg_to_sub_max_offset() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 65535, %shl + %ptr = inttoptr i32 %add to i8 addrspace(3)* + store i8 13, i8 addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_max_offset_p1: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN: ds_write_b8 [[NEG]], [[K]]{{$}} +define void @add_x_shl_neg_to_sub_max_offset_p1() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 65536, %shl + %ptr = inttoptr i32 %add to i8 addrspace(3)* + store i8 13, i8 addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_multi_use: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] 
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}} +; GCN: s_endpgm +define void @add_x_shl_neg_to_sub_multi_use() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add0 = add i32 123, %shl + %add1 = add i32 456, %shl + %ptr0 = inttoptr i32 %add0 to i32 addrspace(3)* + store volatile i32 13, i32 addrspace(3)* %ptr0 + %ptr1 = inttoptr i32 %add1 to i32 addrspace(3)* + store volatile i32 13, i32 addrspace(3)* %ptr1 + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_multi_use_same_offset: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} +; GCN-NOT: v_sub +; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} +; GCN: s_endpgm +define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 123, %shl + %ptr = inttoptr i32 %add to i32 addrspace(3)* + store volatile i32 13, i32 addrspace(3)* %ptr + store volatile i32 13, i32 addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_misaligned_i64_max_offset: +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255 +define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 1019, %shl + %ptr = inttoptr i32 %add to i64 addrspace(3)* + store i64 123, i64 addrspace(3)* %ptr, align 4 + ret void +} + +; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: +; 
GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 +; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]] +; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} +define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %neg = sub i32 0, %x.i + %shl = shl i32 %neg, 2 + %add = add i32 1020, %shl + %ptr = inttoptr i32 %add to i64 addrspace(3)* + store i64 123, i64 addrspace(3)* %ptr, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } +attributes #2 = { nounwind convergent } diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll index ec04f8b1acd6a..5170d9c82712f 100644 --- a/test/CodeGen/AMDGPU/ds_read2.ll +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -216,10 +216,8 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f ret void } -; We should be able to merge in this case, but probably not worth the effort. -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 +; SI-LABEL: {{^}}read2_ptr_is_subreg_f32: +; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} ; SI: s_endpgm define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -507,9 +505,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll 
b/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 842c2d8bc3394..0061aaf2cdbd1 100644 --- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -35,14 +35,11 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { ret void } -; FIXME: Shuffling to new superregister ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} -; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]] -; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]] -; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]] -; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]] +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]] ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD2]] ; CI: s_endpgm @@ -88,8 +85,13 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} + +; FIXME: These moves shouldn't be necessary, it should be able to +; store the same register if offset1 was the non-zero offset. 
+ +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { @@ -102,8 +104,9 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { @@ -115,19 +118,16 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { ret void } +; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v8f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword +; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 ; CI: s_endpgm define void 
@simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -138,33 +138,24 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { ret void } +; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v16f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 +; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}} +; CI: v_mov_b32 +; CI: v_mov_b32 ; CI: s_waitcnt lgkmcnt(0) -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: 
buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -238,9 +229,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll index e2e441214b4ab..4a0571ea16f2b 100644 --- a/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -65,7 +65,7 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add ; SI-LABEL: @simple_read2st64_f32_over_max_offset ; SI-NOT: ds_read2st64_b32 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 ; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] ; SI: s_endpgm @@ -197,7 +197,7 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a ; SI-LABEL: @simple_read2st64_f64_over_max_offset ; SI-NOT: ds_read2st64_b64 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: 
v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] ; SI: s_endpgm @@ -264,9 +264,5 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index d4973e377b596..9d3a293f3b898 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -345,8 +345,9 @@ define void @store_constant_disjoint_offsets() { ; SI-LABEL: @store_misaligned64_constant_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI: s_endpgm define void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -430,9 +431,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind 
"less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll index 358aa6a9e3636..5a1024ccf6d72 100644 --- a/test/CodeGen/AMDGPU/ds_write2st64.ll +++ b/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -109,9 +109,9 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll new file mode 100644 index 0000000000000..f4409a0984a96 --- /dev/null +++ b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -0,0 +1,11 @@ +; RUN: not llc -march=amdgcn -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported dynamic alloca in test_dynamic_stackalloc + +define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { + %alloca = alloca i32, i32 %n + store volatile i32 0, i32* %alloca + ret void +} diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll 
b/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll new file mode 100644 index 0000000000000..e325591396236 --- /dev/null +++ b/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; How the replacement of i64 stores with v2i32 stores resulted in +; breaking other users of the bitcast if they already existed + +; GCN-LABEL: {{^}}extract_vector_elt_select_error: +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dwordx2 +define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) nounwind { + %vec = bitcast i64 %val to <2 x i32> + %elt0 = extractelement <2 x i32> %vec, i32 0 + %elt1 = extractelement <2 x i32> %vec, i32 1 + + store volatile i32 %elt0, i32 addrspace(1)* %out + store volatile i32 %elt1, i32 addrspace(1)* %out + store volatile i64 %val, i64 addrspace(1)* %in + ret void +} + + +define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) nounwind { + %p0 = extractelement <2 x i64> %foo, i32 0 + %p1 = extractelement <2 x i64> %foo, i32 1 + %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1 + store volatile i64 %p1, i64 addrspace(1)* %out + store volatile i64 %p0, i64 addrspace(1)* %out1 + ret void +} + +define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) nounwind { + %dynelt = extractelement <2 x i64> %foo, i32 %elt + store volatile i64 %dynelt, i64 addrspace(1)* %out + ret void +} + +define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) nounwind { + %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo + %or = or <2 x i64> %load, %arst + %dynelt = extractelement <2 x i64> %or, i32 %elt + store volatile i64 %dynelt, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll index 
485c55870c479..19c17289da3d4 100644 --- a/test/CodeGen/AMDGPU/fadd64.ll +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -1,14 +1,44 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; CHECK: {{^}}fadd_f64: +; CHECK-LABEL: {{^}}v_fadd_f64: ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} +define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fadd double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}s_fadd_f64: +; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { + %r2 = fadd double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}v_fadd_v2f64: +; CHECK: v_add_f64 +; CHECK: v_add_f64 +; CHECK: buffer_store_dwordx4 +define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = fadd <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out + ret void +} -define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fadd double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void +; CHECK-LABEL: {{^}}s_fadd_v2f64: +; CHECK: v_add_f64 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: buffer_store_dwordx4 +define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { + %r2 = fadd <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out + ret void } diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll index e8c34f0141e40..c8ef5b101c4d0 100644 --- a/test/CodeGen/AMDGPU/fceil64.ll +++ b/test/CodeGen/AMDGPU/fceil64.ll @@ -17,12 +17,12 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone ; SI: s_lshr_b64 ; SI: s_not_b64 ; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 +; SI-DAG: cmp_gt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 +; SI-DAG: cmp_lt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 ; SI-DAG: v_cmp_lt_f64 ; SI-DAG: v_cmp_lg_f64 ; SI: s_and_b64 diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll index 5207ab57bade3..97d954fcc3c27 100644 --- a/test/CodeGen/AMDGPU/fcmp.ll +++ b/test/CodeGen/AMDGPU/fcmp.ll @@ -20,7 +20,7 @@ entry: ; CHECK: {{^}}fcmp_br: ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} -; CHECK-NEXT {{[0-9]+(5.0}} +; CHECK-NEXT: {{[0-9]+\(5.0}} define void @fcmp_br(i32 addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll index 8ceca078f2d6c..86e0c07323bb2 100644 --- a/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/test/CodeGen/AMDGPU/flat-address-space.ll @@ -7,39 +7,16 @@ ; specialize away generic pointer accesses. 
-; CHECK-LABEL: {{^}}branch_use_flat_i32: -; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; CHECK: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { -entry: - %cmp = icmp ne i32 %c, 0 - br i1 %cmp, label %local, label %global - -local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* - br label %end - -global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - br label %end - -end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - - - ; These testcases might become useless when there are optimizations to ; remove generic pointers. ; CHECK-LABEL: {{^}}store_flat_i32: -; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK-DAG: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]], +; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]], +; CHECK: s_waitcnt lgkmcnt(0) +; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* @@ -83,7 +60,7 @@ define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { -; CHECK-LABEL @load_flat_i32: +; CHECK-LABEL: load_flat_i32: ; CHECK: flat_load_dword define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* @@ -92,7 
+69,7 @@ define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noa ret void } -; CHECK-LABEL @load_flat_i64: +; CHECK-LABEL: load_flat_i64: ; CHECK: flat_load_dwordx2 define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* @@ -101,7 +78,7 @@ define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa ret void } -; CHECK-LABEL @load_flat_v4i32: +; CHECK-LABEL: load_flat_v4i32: ; CHECK: flat_load_dwordx4 define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* @@ -110,7 +87,7 @@ define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> add ret void } -; CHECK-LABEL @sextload_flat_i8: +; CHECK-LABEL: sextload_flat_i8: ; CHECK: flat_load_sbyte define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* @@ -120,7 +97,7 @@ define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ret void } -; CHECK-LABEL @zextload_flat_i8: +; CHECK-LABEL: zextload_flat_i8: ; CHECK: flat_load_ubyte define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* @@ -130,7 +107,7 @@ define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ret void } -; CHECK-LABEL @sextload_flat_i16: +; CHECK-LABEL: sextload_flat_i16: ; CHECK: flat_load_sshort define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* @@ -140,7 +117,7 @@ define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ret void } -; CHECK-LABEL 
@zextload_flat_i16: +; CHECK-LABEL: zextload_flat_i16: ; CHECK: flat_load_ushort define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* @@ -150,35 +127,9 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ret void } - - -; TODO: This should not be zero when registers are used for small -; scratch allocations again. - -; Check for prologue initializing special SGPRs pointing to scratch. -; CHECK-LABEL: {{^}}store_flat_scratch: -; CHECK: s_movk_i32 flat_scratch_lo, 0 -; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} -; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} -; CHECK: flat_store_dword -; CHECK: s_barrier -; CHECK: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { - %alloca = alloca i32, i32 9, align 4 - %x = call i32 @llvm.r600.read.tidig.x() #3 - %pptr = getelementptr i32, i32* %alloca, i32 %x - %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr - ; Dummy call - call void @llvm.AMDGPU.barrier.local() #1 - %reload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %reload, i32 addrspace(1)* %out, align 4 - ret void -} - declare void @llvm.AMDGPU.barrier.local() #1 declare i32 @llvm.r600.read.tidig.x() #3 attributes #0 = { nounwind } -attributes #1 = { nounwind noduplicate } +attributes #1 = { nounwind convergent } attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll new file mode 100644 index 0000000000000..e2ae3353ae1d3 --- /dev/null +++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -march=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=CI +; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI + +; 
GCN-LABEL: {{^}}no_vcc_no_flat: +; GCN: ; NumSgprs: 8 +define void @no_vcc_no_flat() { +entry: + call void asm sideeffect "", "~{SGPR7}"() + ret void +} + +; GCN-LABEL: {{^}}vcc_no_flat: +; GCN: ; NumSgprs: 10 +define void @vcc_no_flat() { +entry: + call void asm sideeffect "", "~{SGPR7},~{VCC}"() + ret void +} + +; GCN-LABEL: {{^}}no_vcc_flat: +; CI: ; NumSgprs: 12 +; VI: ; NumSgprs: 14 +define void @no_vcc_flat() { +entry: + call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() + ret void +} + +; GCN-LABEL: {{^}}vcc_flat: +; CI: ; NumSgprs: 12 +; VI: ; NumSgprs: 14 +define void @vcc_flat() { +entry: + call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() + ret void +} diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll index bd574b877117e..6f3437048ed89 100644 --- a/test/CodeGen/AMDGPU/fma-combine.ll +++ b/test/CodeGen/AMDGPU/fma-combine.ll @@ -364,5 +364,205 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias % ret void } +; +; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y) +; + +; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]] +define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, 1.0 + %m = fmul float %a, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]] +define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, 1.0 + %m = fmul float %y, %a + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}test_f32_mul_add_x_negone_y: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, -1.0 + %m = fmul float %a, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %a = fadd float %x, -1.0 + %m = fmul float %y, %a + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]] +define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float 1.0, %x + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]] +define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float 1.0, %x + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* 
%in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float -1.0, %x + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x: +; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float -1.0, %x + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, 1.0 + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one: +; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]] +define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, 1.0 + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]] +define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, -1.0 + %m = fmul float %s, %y + store float %m, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone: +; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]] 
+define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %s = fsub float %x, -1.0 + %m = fmul float %y, %s + store float %m, float addrspace(1)* %out + ret void +} + +; +; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y)) +; + +; FUNC-LABEL: {{^}}test_f32_interp: +; SI: v_mad_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]] +; SI: v_mac_f32_e32 [[VR]], [[VT]], [[VX:v[0-9]]] +define void @test_f32_interp(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2, + float addrspace(1)* %in3) { + %x = load float, float addrspace(1)* %in1 + %y = load float, float addrspace(1)* %in2 + %t = load float, float addrspace(1)* %in3 + %t1 = fsub float 1.0, %t + %tx = fmul float %x, %t + %ty = fmul float %y, %t1 + %r = fadd float %tx, %ty + store float %r, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_f64_interp: +; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]] +; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] +define void @test_f64_interp(double addrspace(1)* %out, + double addrspace(1)* %in1, + double addrspace(1)* %in2, + double addrspace(1)* %in3) { + %x = load double, double addrspace(1)* %in1 + %y = load double, double addrspace(1)* %in2 + %t = load double, double addrspace(1)* %in3 + %t1 = fsub double 1.0, %t + %tx = fmul double %x, %t + %ty = fmul double %y, %t1 + %r = fadd double %tx, %ty + store double %r, double addrspace(1)* %out + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll index 413957d2982ac..d374fb67350cc 100644 --- a/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -87,6 +87,46 @@ define void 
@test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace( ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0 + %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ogt <1 x float> %a, %b + %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, <1 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32: +; SI-SAFE: v_max_legacy_f32_e32 +; SI-SAFE: v_max_legacy_f32_e32 +; SI-SAFE: v_max_legacy_f32_e32 +; SI-NONAN: v_max_f32_e32 +; SI-NONAN: v_max_f32_e32 +; SI-NONAN: v_max_f32_e32 +define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <3 x float>, <3 x float> addrspace(1)* %gep.0 + %b = load <3 x float>, <3 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ogt <3 x float> %a, %b + %val = select <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, <3 x float> addrspace(1)* %out + ret void +} ; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use ; SI: buffer_load_dword [[A:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll index 6a625c239d761..52fc3d0d251a4 100644 --- a/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -96,6 +96,69 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace( ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0 + %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ult <1 x float> %a, %b + %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, <1 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32: +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 + +; SI-NONAN: v_min_f32_e32 +; SI-NONAN: v_min_f32_e32 +define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <2 x float>, <2 x float> addrspace(1)* %gep.0 + %b = load <2 x float>, <2 x float> 
addrspace(1)* %gep.1 + + %cmp = fcmp ult <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %val, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32: +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 + +; SI-NONAN: v_min_f32_e32 +; SI-NONAN: v_min_f32_e32 +; SI-NONAN: v_min_f32_e32 +define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 + + %a = load <3 x float>, <3 x float> addrspace(1)* %gep.0 + %b = load <3 x float>, <3 x float> addrspace(1)* %gep.1 + + %cmp = fcmp ult <3 x float> %a, %b + %val = select <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, <3 x float> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll new file mode 100644 index 0000000000000..1ee92b2f7c086 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -0,0 +1,102 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't +; make add an instruction if the fadd has more than one use. 
+ +declare float @llvm.fabs.f32(float) #1 + +; GCN-LABEL: {{^}}multiple_fadd_use_test: +; GCN: v_max_legacy_f32_e64 [[A16:v[0-9]+]], +; GCN: v_add_f32_e32 [[A17:v[0-9]+]], [[A16]], [[A16]] +; GCN: v_mul_f32_e32 [[A18:v[0-9]+]], [[A17]], [[A17]] +; GCN: v_mad_f32 [[A20:v[0-9]+]], -[[A18]], [[A17]], 1.0 +; GCN: buffer_store_dword [[A20]] +define void @multiple_fadd_use_test(float addrspace(1)* %out, float %x, float %y, float %z) #0 { + %a11 = fadd fast float %y, -1.0 + %a12 = call float @llvm.fabs.f32(float %a11) + %a13 = fadd fast float %x, -1.0 + %a14 = call float @llvm.fabs.f32(float %a13) + %a15 = fcmp ogt float %a12, %a14 + %a16 = select i1 %a15, float %a12, float %a14 + %a17 = fmul fast float %a16, 2.0 + %a18 = fmul fast float %a17, %a17 + %a19 = fmul fast float %a18, %a17 + %a20 = fsub fast float 1.0, %a19 + store float %a20, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}multiple_use_fadd_fmac +; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_mac_f32_e64 [[MAD:v[0-9]+]], 2.0, [[X]] +; GCN-DAG: buffer_store_dword [[MUL2]] +; GCN-DAG: buffer_store_dword [[MAD]] +; GCN: s_endpgm +define void @multiple_use_fadd_fmac(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %mul2 = fmul fast float %x, 2.0 + %mad = fadd fast float %mul2, %y + store float %mul2, float addrspace(1)* %out + store float %mad, float addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}multiple_use_fadd_fmad: +; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}| +; GCN-DAG: v_mad_f32 [[MAD:v[0-9]+]], 2.0, |[[X]]|, v{{[0-9]+}} +; GCN-DAG: buffer_store_dword [[MUL2]] +; GCN-DAG: buffer_store_dword [[MAD]] +; GCN: s_endpgm +define void @multiple_use_fadd_fmad(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %x.abs = call float @llvm.fabs.f32(float %x) + %mul2 = fmul fast float 
%x.abs, 2.0 + %mad = fadd fast float %mul2, %y + store float %mul2, float addrspace(1)* %out + store float %mad, float addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad: +; GCN: v_mad_f32 {{v[0-9]+}}, 2.0, |[[X:s[0-9]+]]|, v{{[0-9]+}} +; GCN: v_mad_f32 {{v[0-9]+}}, 2.0, |[[X]]|, v{{[0-9]+}} +define void @multiple_use_fadd_multi_fmad(float addrspace(1)* %out, float %x, float %y, float %z) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %x.abs = call float @llvm.fabs.f32(float %x) + %mul2 = fmul fast float %x.abs, 2.0 + %mad0 = fadd fast float %mul2, %y + %mad1 = fadd fast float %mul2, %z + store float %mad0, float addrspace(1)* %out + store float %mad1, float addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}fmul_x2_xn2: +; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], -4.0, [[X:s[0-9]+]] +; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] +; GCN: buffer_store_dword [[RESULT]] +define void @fmul_x2_xn2(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %mul2 = fmul fast float %x, 2.0 + %muln2 = fmul fast float %x, -2.0 + %mul = fmul fast float %mul2, %muln2 + store float %mul, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmul_x2_xn3: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xc0c00000 +; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] +; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] +; GCN: buffer_store_dword [[RESULT]] +define void @fmul_x2_xn3(float addrspace(1)* %out, float %x, float %y) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %mul2 = fmul fast float %x, 2.0 + %muln2 = fmul fast float %x, -3.0 + %mul = fmul fast float %mul2, %muln2 + store float %mul, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll 
b/test/CodeGen/AMDGPU/fneg-fabs.ll index 3b4930d9897d1..b99d2712ed758 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -34,8 +34,7 @@ define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { ; R600: |PV.{{[XYZW]}}| ; R600: -PV -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) @@ -49,8 +48,7 @@ define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { ; R600: |PV.{{[XYZW]}}| ; R600: -PV -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @fabs(float %bc) @@ -60,8 +58,7 @@ define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { } ; FUNC-LABEL: {{^}}fneg_fabs_f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) %fsub = fsub float -0.000000e+00, %fabs @@ -85,11 +82,8 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: -PV -; FIXME: SGPR should be used directly for first src operand. 
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs @@ -97,14 +91,11 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { ret void } -; FIXME: SGPR should be used directly for first src operand. ; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll index 6618d8b5e57e3..83a8ad8901d21 100644 --- a/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -29,12 +29,12 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { ; SI: s_lshr_b64 ; SI: s_not_b64 ; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 +; SI-DAG: cmp_gt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 +; SI-DAG: cmp_lt_i32 +; SI-DAG: cndmask_b32 +; SI-DAG: cndmask_b32 ; SI: 
s_endpgm define void @ftrunc_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.trunc.f64(double %x) nounwind readnone diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll index 471b0f6b13e78..f5ab390ce686d 100644 --- a/test/CodeGen/AMDGPU/gep-address-space.ll +++ b/test/CodeGen/AMDGPU/gep-address-space.ll @@ -11,24 +11,35 @@ define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { ret void } -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on ; SI, which is why it is being OR'd with the base pointer. ; SI: s_or_b32 ; CI: s_add_i32 ; CHECK: ds_write_b32 +define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 store i32 99, i32 addrspace(3)* %p ret void } -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { ; CHECK-LABEL: {{^}}gep_as_vector_v4: -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 + +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} + +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CHECK: s_endpgm +define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> %p0 = 
extractelement <4 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 @@ -41,10 +52,15 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind ret void } -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { ; CHECK-LABEL: {{^}}gep_as_vector_v2: -; CHECK: s_add_i32 -; CHECK: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 +; CHECK: s_endpgm +define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll new file mode 100644 index 0000000000000..bc5f031cd4a29 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-constant.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOHSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s + +@readonly = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0] +@readonly2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0] + +; GCN-LABEL: {{^}}main: +; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], readonly +; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], 0 +; GCN: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], readonly +; GCN: s_addc_u32 
s{{[0-9]+}}, s[[PC1_HI]], 0 +; NOHSA: .text +; HSA: .hsatext +; GCN: readonly: +; GCN: readonly2: +define void @main(i32 %index, float addrspace(1)* %out) { + %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly, i32 0, i32 %index + %val = load float, float addrspace(2)* %ptr + store float %val, float addrspace(1)* %out + %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly2, i32 0, i32 %index + %val2 = load float, float addrspace(2)* %ptr2 + store float %val2, float addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll index 79b83452939e9..e5e6be2199c37 100644 --- a/test/CodeGen/AMDGPU/global-extload-i32.ll +++ b/test/CodeGen/AMDGPU/global-extload-i32.ll @@ -49,8 +49,7 @@ define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: ; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i32>, <2 x i32> addrspace(1)* %in @@ -63,8 +62,7 @@ define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; SI: buffer_load_dwordx2 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i32>, <2 x i32> addrspace(1)* %in @@ -75,10 +73,8 @@ define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: ; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: 
buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -93,10 +89,8 @@ define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -106,22 +100,12 @@ define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i } ; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -131,14 +115,8 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: 
buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 @@ -148,15 +126,10 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -166,50 +139,34 @@ define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: 
buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i32>, <16 x i32> addrspace(1)* %in @@ -219,40 +176,19 @@ define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void 
@zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i32>, <16 x i32> addrspace(1)* %in @@ -262,41 +198,15 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 @@ -331,41 +241,25 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; 
SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { @@ -376,77 +270,34 @@ define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: 
buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void 
@zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll index 146f0a5fbf260..6786e4a2f375a 100644 --- a/test/CodeGen/AMDGPU/global_atomics.ll +++ b/test/CodeGen/AMDGPU/global_atomics.ll @@ -12,7 +12,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: -; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -105,7 +105,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: -; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -197,7 +197,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: -; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -289,7 +289,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: -; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -381,7 +381,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: -; 
GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -473,7 +473,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: -; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -565,7 +565,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: -; GCN: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -657,7 +657,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: -; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -749,7 +749,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: -; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: @@ -838,7 +838,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: -; GCN: 
buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll index bf8f11860b50d..a02cbf43c4009 100644 --- a/test/CodeGen/AMDGPU/half.ll +++ b/test/CodeGen/AMDGPU/half.ll @@ -105,6 +105,26 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 + +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, <8 x float> addrspace(1)* %out @@ -112,12 +132,24 @@ define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x hal } ; GCN-LABEL: {{^}}extload_f16_to_f64_arg: +; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} +; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}} +; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]] +; GCN: buffer_store_dwordx2 [[RESULT]] define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to double store double %ext, double addrspace(1)* %out ret void } + ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v 
+; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: s_endpgm define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x double> store <2 x double> %ext, <2 x double> addrspace(1)* %out @@ -125,6 +157,16 @@ define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x ha } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: s_endpgm define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x double> store <3 x double> %ext, <3 x double> addrspace(1)* %out @@ -132,6 +174,19 @@ define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x ha } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: s_endpgm define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, <4 x double> addrspace(1)* %out @@ -139,6 +194,37 @@ define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x ha } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v + +; 
GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v +; GCN-DAG: buffer_load_ushort v + +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 + +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 + +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 + +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 +; GCN-DAG: v_cvt_f64_f32_e32 + +; GCN: s_endpgm define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x double> store <8 x double> %ext, <8 x double> addrspace(1)* %out @@ -194,6 +280,12 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace( } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: +; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} +; GCN: s_endpgm define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x float> @@ -226,6 +318,46 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: 
buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 + +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 + +; GCN: s_endpgm define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x float> @@ -246,6 +378,14 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: +; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} +; GCN: s_endpgm define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x double> @@ -254,6 +394,25 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: + 
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32 +; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]] +; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN-NOT: v_cvt_f32_f16_e32 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN-NOT: v_cvt_f64_f32_e32 + +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 +; GCN: s_endpgm define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x double> @@ -310,13 +469,12 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 ret void } -; FIXME: Shouldn't do 4th conversion ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: ; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 +; GCN-NOT: v_cvt_f16_f32_e32 ; GCN: buffer_store_short ; GCN: buffer_store_dword ; GCN: s_endpgm @@ -346,14 +504,8 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 } ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 @@ -379,54 +531,42 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 } ; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: 
buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: 
buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_short ; GCN: s_endpgm define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll new file mode 100644 index 0000000000000..1d76c40c042e8 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-globals.ll @@ -0,0 +1,132 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=ASM %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s | FileCheck %s --check-prefix=ELF + +@internal_global_program = internal addrspace(1) global i32 0 +@common_global_program = common addrspace(1) global i32 0 +@external_global_program = addrspace(1) global i32 0 + +@internal_global_agent = internal addrspace(1) global i32 0, section ".hsadata_global_agent" +@common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent" +@external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent" + +@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0 +@external_readonly = unnamed_addr addrspace(2) constant i32 0 + +define void @test() { + ret void +} + +; ASM: .amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_program +; ASM: internal_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_program +; ASM: common_global_program: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_program +; ASM: external_global_program: +; ASM: .long 0 + +; ASM: 
.amdgpu_hsa_module_global internal_global +; ASM: .hsadata_global_agent +; ASM: internal_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global common_global +; ASM: .hsadata_global_agent +; ASM: common_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_global +; ASM: .hsadata_global_agent +; ASM: external_global_agent: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_module_global internal_readonly +; ASM: .hsatext +; ASM: internal_readonly: +; ASM: .long 0 + +; ASM: .amdgpu_hsa_program_global external_readonly +; ASM: .hsatext +; ASM: external_readonly: +; ASM: .long 0 + +; ELF: Section { +; ELF: Name: .hsadata_global_program +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x100003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Section { +; ELF: Name: .hsadata_global_agent +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0x900003) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000) +; ELF: SHF_WRITE (0x1) +; ELF: ] +; ELF: } + +; ELF: Symbol { +; ELF: Name: common_global_agent +; ELF: Binding: Local +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: common_global_program +; ELF: Binding: Local +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_agent +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_global_program +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: internal_readonly +; ELF: Binding: Local +; ELF: Type: Object +; ELF: Section: .hsatext +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_agent +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_agent +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_global_program +; ELF: 
Binding: Global +; ELF: Type: Object +; ELF: Section: .hsadata_global_program +; ELF: } + +; ELF: Symbol { +; ELF: Name: external_readonly +; ELF: Binding: Global +; ELF: Type: Object +; ELF: Section: .hsatext +; ELF: } diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll new file mode 100644 index 0000000000000..1999dc38a6b0f --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s + +@internal_group = internal addrspace(3) global i32 undef +@external_group = addrspace(3) global i32 undef + +define void @test() { +entry: + store i32 0, i32 addrspace(3)* @internal_group + store i32 0, i32 addrspace(3)* @external_group + ret void +} + +; HSA-NOT: internal_group: +; HSA-NOT: external_group: diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 653a6bb1b6098..abc89b7fd837c 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -1,11 +1,24 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -s -sd | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s +; RUN: llc < %s 
-mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF ; The SHT_NOTE section contains the output from the .hsa_code_object_* ; directives. +; ELF: Section { +; ELF: Name: .hsatext +; ELF: Type: SHT_PROGBITS (0x1) +; ELF: Flags [ (0xC00007) +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_AMDGPU_HSA_AGENT (0x800000) +; ELF: SHF_AMDGPU_HSA_CODE (0x400000) +; ELF: SHF_EXECINSTR (0x4) +; ELF: SHF_WRITE (0x1) +; ELF: } + ; ELF: SHT_NOTE ; ELF: 0000: 04000000 08000000 01000000 414D4400 ; ELF: 0010: 01000000 00000000 04000000 1B000000 @@ -13,20 +26,31 @@ ; ELF: 0030: 00000000 00000000 414D4400 414D4447 ; ELF: 0040: 50550000 +; ELF: Symbol { +; ELF: Name: simple +; ELF: Type: AMDGPU_HSA_KERNEL (0xA) +; ELF: } + ; HSA: .hsa_code_object_version 1,0 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" +; HSA: .hsatext + +; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: .end_amd_kernel_code_t -; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0 +; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 ; On VI+ we also need to set MTYPE = 2 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000 -; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0 +; Make sure we generate flat store for HSA +; HSA: flat_store_dword v{{[0-9]+}} define void @simple(i32 addrspace(1)* %out) { entry: diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll new file mode 100644 index 0000000000000..5906b2f157096 --- /dev/null +++ b/test/CodeGen/AMDGPU/image-attributes.ll @@ -0,0 +1,206 
@@ +; RUN: llc -march=r600 -mcpu=juniper < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; === WIDTH ================================================================== +; 9 implicit args = 9 dwords to first image argument. +; First width at dword index 9+1 -> KC0[2].Z + +; FUNC-LABEL: {{^}}width_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z +define void @width_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}width_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z +define void @width_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === HEIGHT ================================================================= +; First height at dword index 9+2 -> KC0[2].W + +; FUNC-LABEL: {{^}}height_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].W +define void @height_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}height_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].W +define void @height_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + 
ret void +} + + +; === DEPTH ================================================================== +; First depth at dword index 9+3 -> KC0[3].X + +; FUNC-LABEL: {{^}}depth_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].X +define void @depth_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [3 x i32] %0, 2 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === CHANNEL DATA TYPE ====================================================== +; First channel data type at dword index 9+4 -> KC0[3].Y + +; FUNC-LABEL: {{^}}data_type_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Y +define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}data_type_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Y +define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] %0, 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === CHANNEL ORDER ========================================================== +; First channel order at dword index 9+5 -> KC0[3].Z + +; FUNC-LABEL: {{^}}channel_order_2d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Z +define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] 
%0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}channel_order_3d: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[3].Z +define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, + i32 addrspace(1)* %out) { +entry: + %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + %1 = extractvalue [2 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +; === 2ND IMAGE ============================================================== +; 9 implicit args + 2 explicit args + 5 implicit args for 1st image argument +; = 16 dwords to 2nd image argument. +; Height of the second image is at 16+2 -> KC0[4].Z +; +; FUNC-LABEL: {{^}}image_arg_2nd: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[4].Z +define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, + i32 %x, + %opencl.image2d_t addrspace(1)* %in2, + i32 addrspace(1)* %out) { +entry: + %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %in2) #0 + %1 = extractvalue [3 x i32] %0, 1 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque + +declare [3 x i32] @llvm.OpenCL.image.get.size.2d(%opencl.image2d_t addrspace(1)*) #0 +declare [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)*) #0 +declare [2 x i32] @llvm.OpenCL.image.get.format.2d(%opencl.image2d_t addrspace(1)*) #0 +declare [2 x i32] @llvm.OpenCL.image.get.format.3d(%opencl.image3d_t addrspace(1)*) #0 + +attributes #0 = { readnone } + +!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} +!0 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @width_2d, + !10, !20, !30, !40, !50} +!1 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @width_3d, + !10, !21, !31, !41, !50} +!2 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @height_2d, + !10, 
!20, !30, !40, !50} +!3 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @height_3d, + !10, !21, !31, !41, !50} +!4 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @depth_3d, + !10, !21, !31, !41, !50} +!5 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @data_type_2d, + !10, !20, !30, !40, !50} +!6 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @data_type_3d, + !10, !21, !31, !41, !50} +!7 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @channel_order_2d, + !10, !20, !30, !40, !50} +!8 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @channel_order_3d, + !10, !21, !31, !41, !50} +!9 = !{void (%opencl.image3d_t addrspace(1)*, i32, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @image_arg_2nd, !12, !22, !32, !42, !52} + +!10 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!20 = !{!"kernel_arg_access_qual", !"read_only", !"none"} +!21 = !{!"kernel_arg_access_qual", !"read_only", !"none"} +!30 = !{!"kernel_arg_type", !"image2d_t", !"int*"} +!31 = !{!"kernel_arg_type", !"image3d_t", !"int*"} +!40 = !{!"kernel_arg_base_type", !"image2d_t", !"int*"} +!41 = !{!"kernel_arg_base_type", !"image3d_t", !"int*"} +!50 = !{!"kernel_arg_type_qual", !"", !""} + +!12 = !{!"kernel_arg_addr_space", i32 1, i32 0, i32 1, i32 1} +!22 = !{!"kernel_arg_access_qual", !"read_only", !"none", !"write_only", !"none"} +!32 = !{!"kernel_arg_type", !"image3d_t", !"sampler_t", !"image2d_t", !"int*"} +!42 = !{!"kernel_arg_base_type", !"image3d_t", !"sampler_t", !"image2d_t", !"int*"} +!52 = !{!"kernel_arg_type_qual", !"", !"", !"", !""} diff --git a/test/CodeGen/AMDGPU/image-resource-id.ll b/test/CodeGen/AMDGPU/image-resource-id.ll new file mode 100644 index 0000000000000..d4cf349442409 --- /dev/null +++ b/test/CodeGen/AMDGPU/image-resource-id.ll @@ -0,0 +1,409 @@ +; RUN: llc -march=r600 -mcpu=juniper < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; === 1 image arg, read_only 
=================================================== + +; FUNC-LABEL: {{^}}test_2d_rd_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_rd_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 1 image arg, write_only ================================================== + +; FUNC-LABEL: {{^}}test_2d_wr_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_wr_1_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 2 image args, read_only ================================================== + +; FUNC-LABEL: {{^}}test_2d_rd_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV 
[[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2d_rd_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_rd_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_rd_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 2 image args, write_only ================================================= + +; FUNC-LABEL: {{^}}test_2d_wr_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; 
EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2d_wr_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_wr_2_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_wr_2_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 3 image args, read_only ================================================== + +; FUNC-LABEL: {{^}}test_2d_rd_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; 
EG-NEXT: 2( +define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + %opencl.image2d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_3d_rd_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + %opencl.image3d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 3 image args, write_only ================================================= + +; FUNC-LABEL: {{^}}test_2d_wr_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; write_only + %opencl.image2d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_3d_wr_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; write_only + %opencl.image3d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + 
store i32 %0, i32 addrspace(1)* %out + ret void +} + +; === 3 image args, mixed ====================================================== + +; FUNC-LABEL: {{^}}test_2d_mix_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + %opencl.image2d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_mix_3_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; read_only + %opencl.image3d_t addrspace(1)* %in3, ; read_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2d_mix_3_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only + %opencl.image3d_t addrspace(1)* %in2, ; read_only + %opencl.image2d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_3d_mix_3_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only + %opencl.image2d_t addrspace(1)* %in2, ; 
read_only + %opencl.image3d_t addrspace(1)* %in3, ; write_only + i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)* %in3) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque + +declare i32 @llvm.OpenCL.image.get.resource.id.2d(%opencl.image2d_t addrspace(1)*) #0 +declare i32 @llvm.OpenCL.image.get.resource.id.3d(%opencl.image3d_t addrspace(1)*) #0 + +attributes #0 = { readnone } + +!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, + !14, !15, !16, !17, !18, !19} +!0 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_rd_1_0, + !110, !120, !130, !140, !150} +!1 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_rd_1_0, + !110, !120, !131, !141, !150} +!2 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_wr_1_0, + !110, !121, !130, !140, !150} +!3 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_wr_1_0, + !110, !121, !131, !141, !150} +!110 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!120 = !{!"kernel_arg_access_qual", !"read_only", !"none"} +!121 = !{!"kernel_arg_access_qual", !"write_only", !"none"} +!130 = !{!"kernel_arg_type", !"image2d_t", !"int*"} +!131 = !{!"kernel_arg_type", !"image3d_t", !"int*"} +!140 = !{!"kernel_arg_base_type", !"image2d_t", !"int*"} +!141 = !{!"kernel_arg_base_type", !"image3d_t", !"int*"} +!150 = !{!"kernel_arg_type_qual", !"", !""} + +!4 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_rd_2_0, !112, !122, !132, !142, !152} +!5 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_rd_2_1, !112, !122, !132, !142, !152} +!6 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_rd_2_0, !112, !122, !133, !143, !152} +!7 = 
!{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_rd_2_1, !112, !122, !133, !143, !152} +!8 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_wr_2_0, !112, !123, !132, !142, !152} +!9 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + i32 addrspace(1)*)* @test_2d_wr_2_1, !112, !123, !132, !142, !152} +!10 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_wr_2_0, !112, !123, !133, !143, !152} +!11 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + i32 addrspace(1)*)* @test_3d_wr_2_1, !112, !123, !133, !143, !152} +!112 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!122 = !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"none"} +!123 = !{!"kernel_arg_access_qual", !"write_only", !"write_only", !"none"} +!132 = !{!"kernel_arg_type", !"image2d_t", !"image2d_t", !"int*"} +!133 = !{!"kernel_arg_type", !"image3d_t", !"image3d_t", !"int*"} +!142 = !{!"kernel_arg_base_type", !"image2d_t", !"image2d_t", !"int*"} +!143 = !{!"kernel_arg_base_type", !"image3d_t", !"image3d_t", !"int*"} +!152 = !{!"kernel_arg_type_qual", !"", !"", !""} + +!12 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_rd_3_0, + !114, !124, !134, !144, !154} +!13 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_rd_3_0, + !114, !124, !135, !145, !154} +!14 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_wr_3_0, + !114, !125, !134, !144, !154} +!15 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_wr_3_0, + !114, !125, 
!135, !145, !154} +!16 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_mix_3_0, + !114, !126, !134, !144, !154} +!17 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_mix_3_0, + !114, !126, !135, !145, !154} +!18 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*, + %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_mix_3_1, + !114, !127, !134, !144, !154} +!19 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*, + %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_mix_3_1, + !114, !127, !135, !145, !154} +!114 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1} +!124 = !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"read_only", !"none"} +!125 = !{!"kernel_arg_access_qual", !"write_only", !"write_only", !"write_only", !"none"} +!126 = !{!"kernel_arg_access_qual", !"write_only", !"read_only", !"read_only", !"none"} +!127 = !{!"kernel_arg_access_qual", !"write_only", !"read_only", !"write_only", !"none"} +!134 = !{!"kernel_arg_type", !"image2d_t", !"image3d_t", !"image2d_t", !"int*"} +!135 = !{!"kernel_arg_type", !"image3d_t", !"image2d_t", !"image3d_t", !"int*"} +!144 = !{!"kernel_arg_base_type", !"image2d_t", !"image3d_t", !"image2d_t", !"int*"} +!145 = !{!"kernel_arg_base_type", !"image3d_t", !"image2d_t", !"image3d_t", !"int*"} +!154 = !{!"kernel_arg_type_qual", !"", !"", !"", !""} diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll index 12eed550eb1fe..8db9ea4ccf314 100644 --- a/test/CodeGen/AMDGPU/imm.ll +++ b/test/CodeGen/AMDGPU/imm.ll @@ -3,8 +3,7 @@ ; Use a 64-bit value with lo bits that can be represented as an inline constant ; CHECK-LABEL: {{^}}i64_imm_inline_lo: -; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] +; CHECK: v_mov_b32_e32 
v[[LO_VGPR:[0-9]+]], 5 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { entry: @@ -14,8 +13,7 @@ entry: ; Use a 64-bit value with hi bits that can be represented as an inline constant ; CHECK-LABEL: {{^}}i64_imm_inline_hi: -; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] +; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5 ; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { entry: @@ -24,10 +22,8 @@ entry: } ; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { store i64 -9223372036854775808, i64 addrspace(1) *%out @@ -523,10 +519,8 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { ; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { store double -0.0, double addrspace(1)* %out @@ -606,10 +600,8 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { } ; CHECK-LABEL: {{^}}store_literal_imm_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 
0x40b00000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} define void @store_literal_imm_f64(double addrspace(1)* %out) { store double 4096.0, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f551606d63a73..e40cac22725ca 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -5,23 +5,52 @@ ; indexing of vectors. ; CHECK-LABEL: {{^}}extract_w_offset: +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 ; CHECK: s_mov_b32 m0 ; CHECK-NEXT: v_movrels_b32_e32 define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { entry: - %0 = add i32 %in, 1 - %1 = extractelement <4 x float> , i32 %0 - store float %1, float addrspace(1)* %out + %idx = add i32 %in, 1 + %elt = extractelement <4 x float> , i32 %idx + store float %elt, float addrspace(1)* %out + ret void +} + +; XXX: Could do v_or_b32 directly +; CHECK-LABEL: {{^}}extract_w_offset_salu_use_vector: +; CHECK-DAG: s_or_b32 +; CHECK-DAG: s_or_b32 +; CHECK-DAG: s_or_b32 +; CHECK-DAG: s_or_b32 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; CHECK: s_mov_b32 m0 +; CHECK-NEXT: v_movrels_b32_e32 +define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { +entry: + %idx = add i32 %in, 1 + %vec = or <4 x i32> %or.val, + %elt = 
extractelement <4 x i32> %vec, i32 %idx + store i32 %elt, i32 addrspace(1)* %out ret void } ; CHECK-LABEL: {{^}}extract_wo_offset: +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 ; CHECK: s_mov_b32 m0 ; CHECK-NEXT: v_movrels_b32_e32 define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { entry: - %0 = extractelement <4 x float> , i32 %in - store float %0, float addrspace(1)* %out + %elt = extractelement <4 x float> , i32 %in + store float %elt, float addrspace(1)* %out ret void } @@ -37,6 +66,19 @@ entry: ret void } +; CHECK-LABEL: {{^}}extract_neg_offset_sgpr_loaded: +; The offset depends on the register that holds the first element of the vector. +; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} +; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0 +define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { +entry: + %index = add i32 %offset, -512 + %or = or <4 x i32> %vec0, %vec1 + %value = extractelement <4 x i32> %or, i32 %index + store i32 %value, i32 addrspace(1)* %out + ret void +} + ; CHECK-LABEL: {{^}}extract_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. ; CHECK: v_readfirstlane_b32 @@ -87,6 +129,21 @@ entry: ret void } +; The vector indexed into is originally loaded into an SGPR rather +; than built with a reg_sequence + +; CHECK-LABEL: {{^}}insert_neg_offset_sgpr_loadreg: +; The offset depends on the register that holds the first element of the vector. 
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} +; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}} +define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { +entry: + %index = add i32 %offset, -512 + %value = insertelement <4 x i32> %vec, i32 5, i32 %index + store <4 x i32> %value, <4 x i32> addrspace(1)* %out + ret void +} + ; CHECK-LABEL: {{^}}insert_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. ; CHECK: v_readfirstlane_b32 diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll index d63e1b6c5212f..2a3b29f54fa9b 100644 --- a/test/CodeGen/AMDGPU/indirect-private-64.ll +++ b/test/CodeGen/AMDGPU/indirect-private-64.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() convergent nounwind ; SI-LABEL: {{^}}private_access_f64_alloca: @@ -18,7 +18,7 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double %array = alloca double, i32 16, align 8 %ptr = getelementptr double, double* %array, i32 %b store double %val, double* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load double, double* %ptr, align 8 store double %result, double addrspace(1)* %out, align 8 ret void @@ -29,20 +29,16 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double ; SI-ALLOCA: buffer_store_dwordx4 ; SI-ALLOCA: buffer_load_dwordx4 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: 
ds_write_b64 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_read_b64 +; SI-PROMOTE: ds_read_b64 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca <2 x double>, i32 16, align 16 %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b store <2 x double> %val, <2 x double>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load <2 x double>, <2 x double>* %ptr, align 16 store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16 ret void @@ -60,7 +56,7 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs %array = alloca i64, i32 16, align 8 %ptr = getelementptr i64, i64* %array, i32 %b store i64 %val, i64* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load i64, i64* %ptr, align 8 store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -71,20 +67,16 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs ; SI-ALLOCA: buffer_store_dwordx4 ; SI-ALLOCA: buffer_load_dwordx4 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_write_b64 +; SI-PROMOTE: ds_read_b64 +; SI-PROMOTE: ds_read_b64 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca <2 x i64>, i32 16, align 16 %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b store <2 x i64> %val, <2 x i64>* 
%ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load <2 x i64>, <2 x i64>* %ptr, align 16 store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 ret void diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll new file mode 100644 index 0000000000000..78868710c6a28 --- /dev/null +++ b/test/CodeGen/AMDGPU/inline-constraints.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: {{^}}inline_reg_constraints: +; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GCN: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] + +define void @inline_reg_constraints(i32 addrspace(1)* %ptr) { +entry: + %v32 = tail call i32 asm sideeffect "flat_load_dword $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %v128 = tail call <4 x i32> asm sideeffect "flat_load_dwordx4 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) + %s32 = tail call i32 asm sideeffect "s_load_dword $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s64 = tail call <2 x i32> asm sideeffect "s_load_dwordx2 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s128 = tail call <4 x i32> asm sideeffect "s_load_dwordx4 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + %s256 = tail call <8 x i32> asm sideeffect "s_load_dwordx8 $0, $1", "=s,s"(i32 addrspace(1)* %ptr) + ret void +} diff --git 
a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 6de3d408c4864..7f9579e59782b 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -70,8 +70,9 @@ define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x fl } ; SI-LABEL: {{^}}dynamic_insertelement_v8f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 +; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 @@ -79,10 +80,11 @@ define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x fl } ; SI-LABEL: {{^}}dynamic_insertelement_v16f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 +; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 @@ -202,10 +204,28 @@ endif: } ; SI-LABEL: {{^}}dynamic_insertelement_v2f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}} +; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} +; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}} + +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; 
SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} + +; SI: s_mov_b32 m0, [[SCALEDIDX]] +; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] + +; Increment to next element. +; FIXME: Should be able to manipulate m0 directly instead of add and +; copy. + +; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1 +; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 +; SI-DAG: s_mov_b32 m0, [[IDX1]] +; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] + +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b @@ -213,9 +233,16 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d ret void } +; FIXME: Inline immediate should be folded into v_movreld_b32. ; SI-LABEL: {{^}}dynamic_insertelement_v2i64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 + +; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}} +; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}} + +; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] +; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] + +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { %vecins = insertelement <2 x i64> %a, i64 5, i32 %b @@ -223,12 +250,29 @@ define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> ret void } +; FIXME: Should be able to do without stack access. The used stack +; space is also 2x what should be required. 
+ ; SI-LABEL: {{^}}dynamic_insertelement_v4f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: SCRATCH_RSRC_DWORD + +; Stack store +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} + +; Write element +; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; Stack reload +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; Store result +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm +; SI: ScratchSize: 64 + define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { %vecins = insertelement <4 x double> %a, double 8.0, i32 %b store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 @@ -236,15 +280,26 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d } ; SI-LABEL: {{^}}dynamic_insertelement_v8f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: SCRATCH_RSRC_DWORD + +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}} +; SI-DAG: buffer_store_dwordx4 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}} + +; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm +; SI: ScratchSize: 128 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { %vecins = insertelement <8 x double> %a, double 8.0, i32 %b store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 1dd7c2cb7995d..e9d98ac89e72d 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -4,8 +4,10 @@ ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; FUNC-LABEL: {{^}}i8_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ubyte +; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { entry: @@ -39,8 +41,10 @@ entry: } ; FUNC-LABEL: {{^}}i16_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ushort +; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z 
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { entry: @@ -290,8 +294,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 @@ -307,7 +311,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 @@ -409,8 +413,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 @@ -434,8 +438,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 
s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll new file mode 100644 index 0000000000000..8347b8c96ec4b --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s + +; FIXME: align on alloca seems to be ignored for private_segment_alignment + +; ALL-LABEL: {{^}}large_alloca_compute_shader: + +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + + +; GCNHSA: .amd_kernel_code_t + +; GCNHSA: compute_pgm_rsrc2_scratch_en = 1 +; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6 +; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1 +; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0 +; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0 +; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0 +; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 + +; GCNHSA: enable_sgpr_private_segment_buffer = 1 +; GCNHSA: enable_sgpr_dispatch_ptr = 0 +; GCNHSA: enable_sgpr_queue_ptr = 0 +; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1 +; GCNHSA: enable_sgpr_dispatch_id = 0 +; 
GCNHSA: enable_sgpr_flat_scratch_init = 0 +; GCNHSA: enable_sgpr_private_segment_size = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCNHSA: workitem_private_segment_byte_size = 32772 +; GCNHSA: private_segment_alignment = 4 +; GCNHSA: .end_amd_kernel_code_t + + +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen + +; Scratch size = alloca size + emergency stack slot +; ALL: ; ScratchSize: 32772 +define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll new file mode 100644 index 0000000000000..141ee2560152b --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s + +; ALL-LABEL: {{^}}large_alloca_pixel_shader: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = 
getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll deleted file mode 100644 index 671833d1a33a5..0000000000000 --- a/test/CodeGen/AMDGPU/large-alloca.ll +++ /dev/null @@ -1,15 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s - -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %0 = load i32, i32* %gep1 - store i32 %0, i32 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll index cff1c24f89d6e..9d2320cb2d19f 100644 
--- a/test/CodeGen/AMDGPU/literals.ll +++ b/test/CodeGen/AMDGPU/literals.ll @@ -7,8 +7,8 @@ ; ADD_INT literal.x KC0[2].Z, 5 ; CHECK: {{^}}i32_literal: -; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 5 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: @@ -24,8 +24,8 @@ entry: ; ADD literal.x KC0[2].Z, 5.0 ; CHECK: {{^}}float_literal: -; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 1084227584(5.0 define void @float_literal(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll index 8bf094b8bc7bf..ca8ddbae9fbc7 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll @@ -8,9 +8,7 @@ declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone ; FUNC-LABEL: {{^}}s_abs_i32: -; SI: s_sub_i32 -; SI: s_max_i32 -; SI: s_endpgm +; SI: s_abs_i32 ; EG: SUB_INT ; EG: MAX_INT diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll index 1168713ca66ee..d56b484572856 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll @@ -425,7 +425,7 @@ define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; SI: buffer_load_dword [[LOAD:v[0-9]+]] ; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 ; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] -; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] +; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] ; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; SI: buffer_store_dword [[TMP2]] define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 
addrspace(1)* %in) nounwind { diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll deleted file mode 100644 index 301de4b1c82d8..0000000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_brev_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll index 805a88b59c721..80eb3b93f8e50 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll @@ -271,7 +271,8 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 
-1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { @@ -285,7 +286,8 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll index f948c987b0385..7dc094ed1b4b7 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll @@ -4,7 +4,6 @@ ; FIXME: Enable for VI. 
declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll new file mode 100644 index 0000000000000..2e299e30b8c74 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}read_workdim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 
@llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll index 74792e50017f3..a30a8e083eb6f 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s ; R600: {{^}}amdgpu_trunc: -; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: {{^}}amdgpu_trunc: ; SI: v_trunc_f32 diff --git a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll new file mode 100644 index 0000000000000..0155757632d4f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}main: +; GCN: v_cvt_pkrtz_f16_f32 +; GCN: v_cvt_pkrtz_f16_f32 +; GCN-NOT: v_cvt_pkrtz_f16_f32 + +define void @main(float %src) #0 { +main_body: + %p1 = call i32 @llvm.SI.packf16(float undef, float %src) + %p2 = call i32 @llvm.SI.packf16(float %src, float undef) + %p3 = call i32 @llvm.SI.packf16(float undef, float undef) + %f1 = bitcast i32 %p1 to float + %f2 = bitcast i32 %p2 to float + %f3 = bitcast i32 %p3 to float + call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f1, float undef, float %f1) + call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f2, float undef, float %f2) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %f3, float undef, float %f2) + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" 
} +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll new file mode 100644 index 0000000000000..6d9db65e7d93a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.buffer.wbinvl1() #0 + +; GCN-LABEL: {{^}}test_buffer_wbinvl1: +; GCN-NEXT: ; BB#0: +; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00] +; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_buffer_wbinvl1() #0 { + call void @llvm.amdgcn.buffer.wbinvl1() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll new file mode 100644 index 0000000000000..746298465e580 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=SI %s + +declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0 + +; SI-LABEL: {{^}}test_buffer_wbinvl1_sc: +; SI-NEXT: ; BB#0: +; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] +; SI-NEXT: s_endpgm +define void @test_buffer_wbinvl1_sc() #0 { + call void @llvm.amdgcn.buffer.wbinvl1.sc() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll new file mode 100644 index 0000000000000..cecfcb1bfe7c0 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | 
FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0 + +; GCN-LABEL: {{^}}test_buffer_wbinvl1_vol: +; GCN-NEXT: ; BB#0: +; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] +; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_buffer_wbinvl1_vol() #0 { + call void @llvm.amdgcn.buffer.wbinvl1.vol() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll new file mode 100644 index 0000000000000..dc95cd1ee012f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test: +; GCN: enable_sgpr_dispatch_ptr = 1 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +define void @test(i32 addrspace(1)* %out) { + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %value = load i32, i32 addrspace(2)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll new file mode 100644 index 0000000000000..a28e1b1eb2413 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -0,0 +1,30 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +;GCN-LABEL: {{^}}v_interp: +;GCN-NOT: s_wqm +;GCN: s_mov_b32 m0, s{{[0-9]+}} 
+;GCN: v_interp_p1_f32 +;GCN: v_interp_p2_f32 +define void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +main_body: + %i = extractelement <2 x i32> %4, i32 0 + %j = extractelement <2 x i32> %4, i32 1 + %p0_0 = call float @llvm.amdgcn.interp.p1(i32 %i, i32 0, i32 0, i32 %3) + %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, i32 %j, i32 0, i32 0, i32 %3) + %p0_1 = call float @llvm.amdgcn.interp.p1(i32 %i, i32 1, i32 0, i32 %3) + %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, i32 %j, i32 1, i32 0, i32 %3) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %p1_1) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll new file mode 100644 index 0000000000000..02ee2039542ae --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -0,0 +1,24 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +;GCN-LABEL: {{^}}mbcnt_intrinsics: +;GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0 +;SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] +;VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] + +define void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 + %hi = 
call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1 + %4 = bitcast i32 %hi to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %4, float %4, float %4, float %4) + ret void +} + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 + +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll new file mode 100644 index 0000000000000..f8af67c17ec2b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.inv() #0 + +; GCN-LABEL: {{^}}test_s_dcache_inv: +; GCN-NEXT: ; BB#0: +; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7] +; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_s_dcache_inv() #0 { + call void @llvm.amdgcn.s.dcache.inv() + ret void +} + +; GCN-LABEL: {{^}}test_s_dcache_inv_insert_wait: +; GCN-NEXT: ; BB#0: +; GCN-NEXT: s_dcache_inv +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_inv_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.inv() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll new file mode 100644 index 0000000000000..a8502a7c5033b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: 
llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.inv.vol() #0 + +; GCN-LABEL: {{^}}test_s_dcache_inv_vol: +; GCN-NEXT: ; BB#0: +; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7] +; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00] +; GCN-NEXT: s_endpgm +define void @test_s_dcache_inv_vol() #0 { + call void @llvm.amdgcn.s.dcache.inv.vol() + ret void +} + +; GCN-LABEL: {{^}}test_s_dcache_inv_vol_insert_wait: +; GCN-NEXT: ; BB#0: +; GCN-NEXT: s_dcache_inv_vol +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_inv_vol_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.inv.vol() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll new file mode 100644 index 0000000000000..f9ae09b391aac --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.wb() #0 + +; VI-LABEL: {{^}}test_s_dcache_wb: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_endpgm +define void @test_s_dcache_wb() #0 { + call void @llvm.amdgcn.s.dcache.wb() + ret void +} + +; VI-LABEL: {{^}}test_s_dcache_wb_insert_wait: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb +; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_wb_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.wb() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll new file mode 100644 index 
0000000000000..d9145458a1f6c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s + +declare void @llvm.amdgcn.s.dcache.wb.vol() #0 + +; VI-LABEL: {{^}}test_s_dcache_wb_vol: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_endpgm +define void @test_s_dcache_wb_vol() #0 { + call void @llvm.amdgcn.s.dcache.wb.vol() + ret void +} + +; VI-LABEL: {{^}}test_s_dcache_wb_vol_insert_wait: +; VI-NEXT: ; BB#0: +; VI-NEXT: s_dcache_wb_vol +; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding +define void @test_s_dcache_wb_vol_insert_wait() #0 { + call void @llvm.amdgcn.s.dcache.wb.vol() + br label %end + +end: + store volatile i32 3, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll index a64dd0ebd2dd8..0c3e4ecaa1a0e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll @@ -4,7 +4,7 @@ declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone ; FUNC-LABEL: {{^}}test_lrp: -; SI: v_sub_f32 +; SI: v_mad_f32 ; SI: v_mac_f32_e32 define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll index d001bcb4db176..b01f8ab2bdf95 100644 --- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,11 +1,11 @@ -; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -mattr=-flat-for-global < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 
-; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- SGPR0_SGPR1 +; CHECK: s_load_dwordx2 s[4:5] +; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5 ; CHECK: buffer_store_dword ; CHECK: s_endpgm -define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 { +define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { entry: tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14 store i32 123, i32 addrspace(1)* %globalptr_arg, align 4 @@ -24,13 +24,13 @@ attributes #1 = { nounwind readnone } !1 = !DIFile(filename: "/tmp/test_debug_value.cl", directory: "/Users/matt/src/llvm/build_debug") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, function: void (i32 addrspace(1)*)* @test_debug_value, variables: !9) +!4 = distinct !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, variables: !9) !5 = !DISubroutineType(types: !6) !6 = !{null, !7} !7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 32) !8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !9 = !{!10} -!10 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "globalptr_arg", arg: 1, scope: !4, file: !1, line: 1, type: !7) +!10 = !DILocalVariable(name: "globalptr_arg", arg: 1, scope: !4, file: !1, line: 1, type: !7) !11 = !{i32 2, !"Dwarf Version", i32 4} !12 = !{i32 2, !"Debug Info Version", i32 3} !13 = !DIExpression() diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll index e491732cf9c5f..d83ab562b7180 100644 --- a/test/CodeGen/AMDGPU/llvm.memcpy.ll +++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -132,32 +132,15 @@ define void 
@test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias % } ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 +; SI: ds_read2_b32 -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 +; SI: ds_write2_b32 ; SI: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { @@ -170,32 +153,15 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias % ; FIXME: Use 64-bit ops ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_read_b64 -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 ; SI-DAG: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll new file mode 100644 index 0000000000000..13ebee41e844e --- /dev/null +++ 
b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -0,0 +1,184 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[1].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 + +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[1].W + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_y(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].X + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z(i32 addrspace(1)* %out) { +entry: + %0 = call i32 
@llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xy: +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xy(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %val = mul i32 %x, %y + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xz: + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %x, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_yz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_yz(i32 addrspace(1)* %out) { +entry: + %y = 
call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %y, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xyz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xyz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %xy = mul i32 %x, %y + %xyz = add i32 %xy, %z + store i32 %xyz, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.x() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 
@llvm.r600.read.local.size.y() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.z() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll index 3d0f57e33280c..6b365dc09e2a9 100644 --- a/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -21,12 +21,9 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 { ; SI-DAG: v_cmp_eq_i32 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff -; SI-DAG: v_cmp_gt_i32_e64 +; SI-DAG: v_cmp_gt_i32_e32 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] -; SI-DAG: v_cmp_gt_i32_e64 - - ; SI: buffer_store_dwordx2 ; SI: s_endpgm define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll index 93b1b51a0d075..6a04261fe47bb 100644 --- a/test/CodeGen/AMDGPU/load.ll +++ b/test/CodeGen/AMDGPU/load.ll @@ -277,15 +277,9 @@ entry: ; FUNC-LABEL: {{^}}load_v8i32: ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. 
-; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword + +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { entry: %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -298,23 +292,11 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword + +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { entry: %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll index f501a7ac62748..6b52b80ba0822 100644 --- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -10,7 +10,7 @@ ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 -; GCN-NEXT: .long 38792 +; GCN-NEXT: .long 32900 ; EG: {{^}}local_memory_two_objects: @@ -30,7 +30,7 @@ ; constant offsets. 
; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] -; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}} +; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] ; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 ; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll index 9494ed75bd0c0..9ffb59e709200 100644 --- a/test/CodeGen/AMDGPU/local-memory.ll +++ b/test/CodeGen/AMDGPU/local-memory.ll @@ -9,9 +9,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 71560 +; SI-NEXT: .long 65668 ; CI: .long 47180 -; CI-NEXT: .long 38792 +; CI-NEXT: .long 32900 ; FUNC-LABEL: {{^}}local_memory: diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll index fef3e2f0a21ca..eeb915c10a960 100644 --- a/test/CodeGen/AMDGPU/max.ll +++ b/test/CodeGen/AMDGPU/max.ll @@ -2,7 +2,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone -; FUNC-LABEL: @v_test_imax_sge_i32 +; FUNC-LABEL: {{^}}v_test_imax_sge_i32: ; SI: v_max_i32_e32 define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -17,6 +17,24 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } +; FUNC-LABEL: {{^}}v_test_imax_sge_v4i32: +; SI: v_max_i32_e32 +; SI: v_max_i32_e32 +; SI: v_max_i32_e32 +; SI: v_max_i32_e32 +define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %out, i32 %tid + 
%a = load <4 x i32>, <4 x i32> addrspace(1)* %gep0, align 4 + %b = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 4 + %cmp = icmp sge <4 x i32> %a, %b + %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %val, <4 x i32> addrspace(1)* %outgep, align 4 + ret void +} + ; FUNC-LABEL: @s_test_imax_sge_i32 ; SI: s_max_i32 define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -35,6 +53,23 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ret void } +; FUNC-LABEL: {{^}}v_test_imax_sge_i8: +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: v_max_i32_e32 +define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %a = load i8, i8 addrspace(1)* %gep0, align 1 + %b = load i8, i8 addrspace(1)* %gep1, align 1 + %cmp = icmp sge i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %outgep, align 1 + ret void +} + ; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -44,6 +79,15 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ret void } +; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_v2i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { + %cmp = icmp sgt <2 x i32> %a, + %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + ret void +} ; FUNC-LABEL: @v_test_imax_sgt_i32 ; SI: v_max_i32_e32 define void 
@v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -92,6 +136,36 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_umax_uge_v3i32: +; SI: s_max_u32 +; SI: s_max_u32 +; SI: s_max_u32 +; SI-NOT: s_max_u32 +; SI: s_endpgm +define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { + %cmp = icmp uge <3 x i32> %a, %b + %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b + store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_test_umax_uge_i8: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_max_u32_e32 +define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %a = load i8, i8 addrspace(1)* %gep0, align 1 + %b = load i8, i8 addrspace(1)* %gep1, align 1 + %cmp = icmp uge i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %outgep, align 1 + ret void +} + ; FUNC-LABEL: @v_test_umax_ugt_i32 ; SI: v_max_u32_e32 define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -107,7 +181,7 @@ define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } -; FUNC-LABEL: @s_test_umax_ugt_i32 +; FUNC-LABEL: {{^}}s_test_umax_ugt_i32: ; SI: s_max_u32 define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp ugt i32 %a, %b @@ -116,13 +190,23 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32: +; SI: s_max_u32 
{{s[0-9]+}}, {{s[0-9]+}}, 15 +; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23 +define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { + %cmp = icmp ugt <2 x i32> %a, + %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] +; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] +; SI-NEXT: buffer_store_dword [[VMAX]] define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 @@ -135,13 +219,13 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1 ; Make sure redundant sign_extend_inreg removed. 
-; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] +; SI-NEXT: buffer_store_dword [[VMAX]] +define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp sgt i32 %a.ext, %b.ext @@ -152,15 +236,13 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 ret void } -; FIXME: Should get match min/max through extends inserted by -; legalization. 
- -; FUNC-LABEL: {{^}}s_test_imin_sge_i16: +; FUNC-LABEL: {{^}}s_test_imax_sge_i16: +; SI: s_load_dword +; SI: s_load_dword ; SI: s_sext_i32_i16 ; SI: s_sext_i32_i16 -; SI: v_cmp_ge_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +; SI: s_max_i32 +define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { %cmp = icmp sge i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll index 34a2fc7ffa745..65b454b5d8cbb 100644 --- a/test/CodeGen/AMDGPU/merge-stores.ll +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -1,5 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s + +; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s ; Run with devices with different unaligned load restrictions. 
@@ -65,10 +68,8 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* } ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: -; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 @@ -89,10 +90,8 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: -; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0 -; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}} -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]] +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0 +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 @@ -121,10 +120,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dwordx2 v +; GCN: buffer_store_dwordx4 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 @@ -137,17 +133,9 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) ret void } -; First store is out of order. 
Because of order of combines, the -; consecutive store fails because only some of the stores have been -; replaced with integer constant stores, and then won't merge because -; the types are different. - +; First store is out of order. ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx4 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 @@ -160,6 +148,33 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { ret void } +; FIXME: Should be able to merge this +; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32: +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v + +; GCN-AA: buffer_store_dwordx2 +; GCN-AA: buffer_store_dword v +; GCN-AA: buffer_store_dword v + +; GCN: s_endpgm +define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* + %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)* + + store i32 11, i32 addrspace(1)* %out.gep.1.bc + store float 2.0, float addrspace(1)* %out.gep.2 + store i32 17, i32 addrspace(1)* %out.gep.3.bc + store float 8.0, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: ; SI-DAG: buffer_store_dwordx2 ; SI-DAG: buffer_store_dword @@ -176,9 +191,7 @@ define void @merge_global_store_3_constants_i32(i32 
addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx4 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 @@ -188,13 +201,8 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 @@ -472,11 +480,15 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1 ; This works once AA is enabled on the subtarget ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XGCN: buffer_store_dwordx4 [[LOAD]] -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v + +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v +; GCN-NOAA: buffer_store_dword v + +; GCN-AA: buffer_store_dwordx4 [[LOAD]] + +; GCN: s_endpgm define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -508,10 +520,8 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { } ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: -; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; 
GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 @@ -522,10 +532,15 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { } ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 +; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8 +; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3 + +; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2 +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1 + +; GCN: s_endpgm define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 @@ -597,17 +612,9 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { } ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: s_endpgm define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 @@ -627,7 +634,78 @@ define void 
@merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { ret void } +; This requires handling of scalar_to_vector for v2i64 to avoid +; scratch usage. +; FIXME: Should do single load and store + +; GCN-LABEL: {{^}}copy_v3i32_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 + +; GCN: ScratchSize: 0{{$}} +define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { + %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 + store <3 x i32> %vec, <3 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}copy_v3i64_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN: ScratchSize: 0{{$}} +define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { + %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 + store <3 x i64> %vec, <3 x i64> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}copy_v3f32_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen 
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN: ScratchSize: 0{{$}} +define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { + %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 + %fadd = fadd <3 x float> %vec, + store <3 x float> %fadd, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}copy_v3f64_align4: +; GCN-NOT: SCRATCH_RSRC_DWORD +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN-NOT: offen +; GCN: s_waitcnt vmcnt +; GCN-NOT: offen +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} +; GCN: ScratchSize: 0{{$}} +define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { + %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 + %fadd = fadd <3 x double> %vec, + store <3 x double> %fadd, <3 x double> addrspace(1)* %out + ret void +} + declare void @llvm.AMDGPU.barrier.local() #1 attributes #0 = { nounwind } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index 0332d1a8e407c..215dbeb4b2fdd 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -2,7 +2,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone -; FUNC-LABEL: @v_test_imin_sle_i32 +; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; SI: v_min_i32_e32 define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -17,7 
+17,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } -; FUNC-LABEL: @s_test_imin_sle_i32 +; FUNC-LABEL: {{^}}s_test_imin_sle_i32: ; SI: s_min_i32 define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sle i32 %a, %b @@ -26,6 +26,78 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32: +; SI: s_min_i32 +define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %cmp = icmp sle <1 x i32> %a, %b + %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b + store <1 x i32> %val, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32: +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { + %cmp = icmp sle <4 x i32> %a, %b + %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_i8: +; SI: s_load_dword +; SI: s_load_dword +; SI: s_sext_i32_i8 +; SI: s_sext_i32_i8 +; SI: s_min_i32 +define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { + %cmp = icmp sle i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %out + ret void +} + +; XXX - should be able to use s_min if we stop unnecessarily doing +; extloads with mubuf instructions. 
+ +; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte + +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 + +; SI: s_endpgm +define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind { + %cmp = icmp sle <4 x i8> %a, %b + %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b + store <4 x i8> %val, <4 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 +; SI: v_min_i32 +define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { + %cmp = icmp sle <4 x i16> %a, %b + %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b + store <4 x i16> %val, <4 x i16> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: @v_test_imin_slt_i32 ; SI: v_min_i32_e32 define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -50,6 +122,16 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ret void } +; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32: +; SI: s_min_i32 +; SI: s_min_i32 +define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %cmp = icmp slt <2 x i32> %a, %b + %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b + store <2 x i32> %val, <2 x i32> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: ; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -83,6 +165,24 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } +; FUNC-LABEL: @v_test_umin_ule_v3i32 +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 +; SI-NOT: 
v_min_u32_e32 +; SI: s_endpgm +define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid + %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep0 + %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep1 + %cmp = icmp ule <3 x i32> %a, %b + %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b + store <3 x i32> %val, <3 x i32> addrspace(1)* %outgep + ret void +} ; FUNC-LABEL: @s_test_umin_ule_i32 ; SI: s_min_u32 define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -107,6 +207,23 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ret void } +; FUNC-LABEL: {{^}}v_test_umin_ult_i8: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_min_u32_e32 +define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %a = load i8, i8 addrspace(1)* %gep0, align 1 + %b = load i8, i8 addrspace(1)* %gep1, align 1 + %cmp = icmp ult i8 %a, %b + %val = select i1 %cmp, i8 %a, i8 %b + store i8 %val, i8 addrspace(1)* %outgep, align 1 + ret void +} + ; FUNC-LABEL: @s_test_umin_ult_i32 ; SI: s_min_u32 define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -137,6 +254,48 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace ret void } + +; FUNC-LABEL: @s_test_umin_ult_v1i32 +; SI: s_min_u32 +define void 
@s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %cmp = icmp ult <1 x i32> %a, %b + %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b + store <1 x i32> %val, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32: +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { + %cmp = icmp ult <8 x i32> %a, %b + %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b + store <8 x i32> %val, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { + %cmp = icmp ult <8 x i16> %a, %b + %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + store <8 x i16> %val, <8 x i16> addrspace(1)* %out + ret void +} + ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb @@ -173,14 +332,8 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 ret void } -; FIXME: Should get match min/max through extends inserted by -; legalization. 
- ; FUNC-LABEL: {{^}}s_test_imin_sle_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_le_i32_e32 -; SI: v_cndmask_b32 +; SI: s_min_i32 define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { %cmp = icmp sle i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll new file mode 100644 index 0000000000000..e9f641b736d56 --- /dev/null +++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s + +; Check that when mubuf addr64 instruction is handled in moveToVALU +; from the pointer, dead register writes are not emitted. + +; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 + +; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: +; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} + +; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] +; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN-NOT: v_mov_b32 + +; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] +; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] +; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, + +define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { +bb: + %tmp = icmp sgt i32 %arg3, 0 + br i1 %tmp, label %bb4, label %bb17 + +bb4: + %tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg + %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1 + %tmp16 = load volatile i8, i8 addrspace(1)* %tmp15 + br label %bb17 + +bb17: + ret void +} + +attributes #0 
= { nounwind } diff --git a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll new file mode 100644 index 0000000000000..8bca0575ecd23 --- /dev/null +++ b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -0,0 +1,52 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; FIXME: broken on VI because flat instructions need to be emitted +; instead of addr64 equivalent of the _OFFSET variants. + +; Check that moving the pointer out of the resource descriptor to +; vaddr works for atomics. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; GCN-LABEL: {{^}}atomic_max_i32: +; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}} +define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid + %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep + %xor = xor i32 %tid, 1 + %cmp = icmp ne i32 %xor, 0 + br i1 %cmp, label %atomic, label %exit + +atomic: + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100 + %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst + store i32 %ret, i32 addrspace(1)* %out + br label %exit + +exit: + ret void +} + +; GCN-LABEL: {{^}}atomic_max_i32_noret: +; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}} +define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid + %ptr = 
load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep + %xor = xor i32 %tid, 1 + %cmp = icmp ne i32 %xor, 0 + br i1 %cmp, label %atomic, label %exit + +atomic: + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100 + %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst + br label %exit + +exit: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll new file mode 100644 index 0000000000000..73a146710a9ff --- /dev/null +++ b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll @@ -0,0 +1,18 @@ +; RUN: not llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported non-compute shaders with HSA in pixel_shader +define void @pixel_shader() #0 { + ret void +} + +define void @vertex_shader() #1 { + ret void +} + +define void @geometry_shader() #2 { + ret void +} + +attributes #0 = { nounwind "ShaderType"="0" } +attributes #1 = { nounwind "ShaderType"="1" } +attributes #2 = { nounwind "ShaderType"="2" } diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll index e4328ecbaca8d..f81911aafe220 100644 --- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -189,3 +189,15 @@ define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace store i8 %trunc, i8 addrspace(1)* %gep.out ret void } + +; FUNC-LABEL: {{^}}smrd_mask_i32_to_i16 +; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0 +; SI: s_waitcnt lgkmcnt(0) +; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff +define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +entry: + %val = load i32, i32 addrspace(2)* %in + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/opencl-image-metadata.ll 
b/test/CodeGen/AMDGPU/opencl-image-metadata.ll new file mode 100644 index 0000000000000..bc467e47dc316 --- /dev/null +++ b/test/CodeGen/AMDGPU/opencl-image-metadata.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; Make sure the OpenCL Image lowering pass doesn't crash when argument metadata +; is not in expected order. + +; EG: CF_END +; SI: s_endpgm +define void @kernel(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} + +attributes #3 = { nounwind } + +!opencl.kernels = !{!0} + +!0 = !{void (i32 addrspace(1)*)* @kernel, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 0} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_type_qual", !""} +!5 = !{!"kernel_arg_name", !""} diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll index 816755efb07ce..9e514ef9970ac 100644 --- a/test/CodeGen/AMDGPU/operand-folding.ll +++ b/test/CodeGen/AMDGPU/operand-folding.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s ; CHECK-LABEL: {{^}}fold_sgpr: -; CHECK: v_add_i32_e32 v{{[0-9]+}}, s +; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { entry: %tmp0 = icmp ne i32 %fold, 0 diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll index 1c04090b407ff..e40f18f040b7a 100644 --- a/test/CodeGen/AMDGPU/or.ll +++ b/test/CodeGen/AMDGPU/or.ll @@ -153,7 +153,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { } ; FUNC-LABEL: {{^}}or_i1: -; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] define void @or_i1(i32 
addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { diff --git a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll new file mode 100644 index 0000000000000..51985af42a290 --- /dev/null +++ b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -verify-coalescing < %s + +; The original and requires materializing a 64-bit immediate for +; s_and_b64. This is split into 2 x v_and_i32, part of the immediate +; is folded through the reg_sequence into the v_and_i32 operand, and +; only half of the result is ever used. +; +; During live interval construction, the first sub register def is +; incorrectly marked as dead. + +declare i32 @llvm.r600.read.tidig.x() #1 + +define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep + + %lshr = shl i64 %val, 24 + %and1 = and i64 %lshr, 2190433320969 ; (255 << 33) | 9 + %vec = bitcast i64 %and1 to <2 x i32> + %elt1 = extractelement <2 x i32> %vec, i32 1 + + store i32 %elt1, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll index 645dc04f44202..79778eebd802b 100644 --- a/test/CodeGen/AMDGPU/private-memory.ll +++ b/test/CodeGen/AMDGPU/private-memory.ll @@ -1,6 +1,8 @@ ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs 
-mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC @@ -13,11 +15,21 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; R600: LDS_READ ; R600: LDS_READ +; HSA-PROMOTE: .amd_kernel_code_t +; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120 +; HSA-PROMOTE: .end_amd_kernel_code_t + ; SI-PROMOTE: ds_write_b32 ; SI-PROMOTE: ds_write_b32 ; SI-PROMOTE: ds_read_b32 ; SI-PROMOTE: ds_read_b32 +; HSA-ALLOCA: .amd_kernel_code_t +; FIXME: Creating the emergency stack slots causes us to over-estimate scratch +; by 4 bytes. 
+; HSA-ALLOCA: workitem_private_segment_byte_size = 24 +; HSA-ALLOCA: .end_amd_kernel_code_t + ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll index de6bfb3108836..4bb315049be4b 100644 --- a/test/CodeGen/AMDGPU/register-count-comments.ll +++ b/test/CodeGen/AMDGPU/register-count-comments.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.SI.tid() nounwind readnone diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll index 187650ff9a537..d5e10d0be883d 100644 --- a/test/CodeGen/AMDGPU/reorder-stores.ll +++ b/test/CodeGen/AMDGPU/reorder-stores.ll @@ -2,14 +2,10 @@ ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <2 
x double>, <2 x double> addrspace(1)* %x, align 16 @@ -34,46 +30,16 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace } ; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll index 6b1a36c979c2a..47c7fbb6dd6a8 100644 --- a/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -3,10 +3,9 @@ ; SI-LABEL: {{^}}s_movk_i32_k0: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: 
v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -17,10 +16,9 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k1: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -31,10 +29,9 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k2: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -45,10 +42,9 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k3: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 
{{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -59,10 +55,9 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k4: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -87,10 +82,9 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k6: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll index 0b9649576545d..a30c25e700aba 100644 --- a/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; 
RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tidig.y() #0 ; In this test both the pointer and the offset operands to the ; BUFFER_LOAD instructions end up being stored in vgprs. This @@ -7,94 +11,267 @@ ; sgpr register pair and use that for the pointer operand ; (low 64-bits of srsrc). -; CHECK-LABEL: {{^}}mubuf: +; GCN-LABEL: {{^}}mubuf: ; Make sure we aren't using VGPRs for the source operand of s_mov_b64 -; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v +; GCN-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v ; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_* ; instructions -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 + +define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { entry: - %0 = call i32 @llvm.r600.read.tidig.x() #1 - %1 = call i32 @llvm.r600.read.tidig.y() #1 - %2 = sext i32 %0 to i64 - %3 = sext i32 %1 to i64 + %tmp = call i32 @llvm.r600.read.tidig.x() + %tmp1 = call i32 @llvm.r600.read.tidig.y() + %tmp2 = sext i32 %tmp to i64 + %tmp3 = sext i32 %tmp1 to i64 br label %loop -loop: - %4 = phi i64 [0, %entry], [%5, %loop] - %5 = add i64 %2, %4 - %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5 - %7 = load i8, i8 addrspace(1)* %6, align 1 - %8 = or i64 %5, 1 - %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8 - %10 = load i8, i8 addrspace(1)* %9, align 1 - %11 = add i8 %7, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - %13 = icmp slt i64 %5, 10 - br i1 %13, label %loop, 
label %done - -done: +loop: ; preds = %loop, %entry + %tmp4 = phi i64 [ 0, %entry ], [ %tmp5, %loop ] + %tmp5 = add i64 %tmp2, %tmp4 + %tmp6 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp5 + %tmp7 = load i8, i8 addrspace(1)* %tmp6, align 1 + %tmp8 = or i64 %tmp5, 1 + %tmp9 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp8 + %tmp10 = load i8, i8 addrspace(1)* %tmp9, align 1 + %tmp11 = add i8 %tmp7, %tmp10 + %tmp12 = sext i8 %tmp11 to i32 + store i32 %tmp12, i32 addrspace(1)* %out + %tmp13 = icmp slt i64 %tmp5, 10 + br i1 %tmp13, label %loop, label %done + +done: ; preds = %loop ret void } -declare i32 @llvm.r600.read.tidig.x() #1 -declare i32 @llvm.r600.read.tidig.y() #1 - -attributes #1 = { nounwind readnone } - ; Test moving an SMRD instruction to the VALU -; CHECK-LABEL: {{^}}smrd_valu: -; CHECK: buffer_load_dword [[OUT:v[0-9]+]] -; CHECK: buffer_store_dword [[OUT]] - -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) { +; GCN-LABEL: {{^}}smrd_valu: +; GCN: buffer_load_dword [[OUT:v[0-9]+]] +; GCN: buffer_store_dword [[OUT]] +define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { entry: - %0 = icmp ne i32 %a, 0 - br i1 %0, label %if, label %else + %tmp = icmp ne i32 %a, 0 + br i1 %tmp, label %if, label %else -if: - %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in +if: ; preds = %entry + %tmp1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in br label %endif -else: - %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2 +else: ; preds = %entry + %tmp2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in + %tmp3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %tmp2 br label %endif -endif: - %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else] - %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000 - %6 = load i32, i32 addrspace(2)* 
%5 - store i32 %6, i32 addrspace(1)* %out +endif: ; preds = %else, %if + %tmp4 = phi i32 addrspace(2)* [ %tmp1, %if ], [ %tmp3, %else ] + %tmp5 = getelementptr i32, i32 addrspace(2)* %tmp4, i32 3000 + %tmp6 = load i32, i32 addrspace(2)* %tmp5 + store i32 %tmp6, i32 addrspace(1)* %out ret void } -; Test moving ann SMRD with an immediate offset to the VALU +; Test moving an SMRD with an immediate offset to the VALU -; CHECK-LABEL: {{^}}smrd_valu2: -; CHECK: buffer_load_dword -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) { +; GCN-LABEL: {{^}}smrd_valu2: +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} +define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { entry: - %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %1 = add i32 %0, 4 - %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4 - %3 = load i32, i32 addrspace(2)* %2 - store i32 %3, i32 addrspace(1)* %out + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out ret void } -; CHECK-LABEL: {{^}}s_load_imm_v8i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +; Use a big offset that will use the SMRD literal offset on CI +; GCN-LABEL: {{^}}smrd_valu_ci_offset: +; GCN-NOT: v_add +; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN: v_add_i32_e32 +; GCN: buffer_store_dword +define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + 
%tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr i32, i32 addrspace(2)* %tmp2, i32 5000 + %tmp4 = load i32, i32 addrspace(2)* %tmp3 + %tmp5 = add i32 %tmp4, %c + store i32 %tmp5, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2: +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx2 +define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr i64, i64 addrspace(2)* %tmp2, i32 5000 + %tmp4 = load i64, i64 addrspace(2)* %tmp3 + %tmp5 = or i64 %tmp4, %c + store i64 %tmp5, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4: +; GCN-NOT: v_add +; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx4 +define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %tmp2, i32 1234 + %tmp4 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp3 + %tmp5 
= or <4 x i32> %tmp4, %c + store <4 x i32> %tmp5, <4 x i32> addrspace(1)* %out + ret void +} + +; Original scalar load uses SGPR offset on SI and 32-bit literal on +; CI. + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} + +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %tmp2, i32 1234 + %tmp4 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp3 + %tmp5 = or <8 x i32> %tmp4, %c + store <8 x i32> %tmp5, <8 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: + +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} +; GCN-NOT: v_add +; GCN: 
buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}} + +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 + +; GCN: s_endpgm +define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp + %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %tmp2, i32 1234 + %tmp4 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp3 + %tmp5 = or <16 x i32> %tmp4, %c + store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu2_salu_user: +; GCN: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]] +; GCN: buffer_store_dword [[ADD]] +define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { 
+entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + %tmp4 = add i32 %tmp3, %a + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset: +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} +define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 255 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset: +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} +define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = add i32 %tmp, 4 + %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 256 + %tmp3 = load i32, i32 addrspace(2)* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_load_imm_v8i32: +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 @@ -102,12 +279,51 @@ entry: ret void } -; CHECK-LABEL: {{^}}s_load_imm_v16i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; 
CHECK: buffer_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user: +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: buffer_store_dword +define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() + %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* + %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 + + %elt0 = extractelement <8 x i32> %tmp3, i32 0 + %elt1 = extractelement <8 x i32> %tmp3, i32 1 + %elt2 = extractelement <8 x i32> %tmp3, i32 2 + %elt3 = extractelement <8 x i32> %tmp3, i32 3 + %elt4 = extractelement <8 x i32> %tmp3, i32 4 + %elt5 = extractelement <8 x i32> %tmp3, i32 5 + %elt6 = extractelement <8 x i32> %tmp3, i32 6 + %elt7 = extractelement <8 x i32> %tmp3, i32 7 + + %add0 = add i32 %elt0, %elt1 + %add1 = add i32 %add0, %elt2 + %add2 = add i32 %add1, %elt3 + %add3 = add i32 %add2, %elt4 + %add4 = add i32 %add3, %elt5 + %add5 = add i32 %add4, %elt6 + %add6 = add i32 %add5, %elt7 + + store i32 %add6, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_load_imm_v16i32: +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -116,3 +332,71 @@ entry: store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 ret void } + +; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user: 
+; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: buffer_store_dword +define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* + %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 + + %elt0 = extractelement <16 x i32> %tmp3, i32 0 + %elt1 = extractelement <16 x i32> %tmp3, i32 1 + %elt2 = extractelement <16 x i32> %tmp3, i32 2 + %elt3 = extractelement <16 x i32> %tmp3, i32 3 + %elt4 = extractelement <16 x i32> %tmp3, i32 4 + %elt5 = extractelement <16 x i32> %tmp3, i32 5 + %elt6 = extractelement <16 x i32> %tmp3, i32 6 + %elt7 = extractelement <16 x i32> %tmp3, i32 7 + %elt8 = extractelement <16 x i32> %tmp3, i32 8 + %elt9 = extractelement <16 x i32> %tmp3, i32 9 + %elt10 = extractelement <16 x i32> %tmp3, i32 10 + %elt11 = extractelement <16 x i32> %tmp3, i32 11 + %elt12 = extractelement <16 x i32> %tmp3, i32 12 + %elt13 = extractelement <16 x i32> %tmp3, i32 13 + %elt14 = extractelement <16 x i32> %tmp3, i32 14 + %elt15 = extractelement <16 x i32> %tmp3, i32 15 + + %add0 = add i32 %elt0, %elt1 + %add1 = add i32 %add0, %elt2 + %add2 = add i32 %add1, %elt3 + %add3 = add i32 %add2, %elt4 + %add4 = add i32 %add3, %elt5 + %add5 = add i32 %add4, %elt6 + %add6 = add i32 %add5, %elt7 + %add7 = add i32 %add6, %elt8 + %add8 = add i32 %add7, %elt9 + %add9 = add i32 %add8, %elt10 + %add10 = add i32 %add9, %elt11 + 
%add11 = add i32 %add10, %elt12 + %add12 = add i32 %add11, %elt13 + %add13 = add i32 %add12, %elt14 + %add14 = add i32 %add13, %elt15 + + store i32 %add14, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/sampler-resource-id.ll b/test/CodeGen/AMDGPU/sampler-resource-id.ll new file mode 100644 index 0000000000000..c41d345369bf6 --- /dev/null +++ b/test/CodeGen/AMDGPU/sampler-resource-id.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=r600 -mcpu=juniper < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 0( +define void @test_0(i32 %in0, i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_1: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 1( +define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], literal.x +; EG-NEXT: LSHR +; EG-NEXT: 2( +define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + + +declare i32 @llvm.OpenCL.sampler.get.resource.id(i32) #0 + +attributes #0 = { readnone } + +!opencl.kernels = !{!0, !1, !2} + +!0 = !{void (i32, i32 addrspace(1)*)* @test_0, !10, !20, !30, !40, !50} +!10 = !{!"kernel_arg_addr_space", i32 0, i32 1} +!20 = !{!"kernel_arg_access_qual", !"none", !"none"} +!30 = !{!"kernel_arg_type", !"sampler_t", !"int*"} +!40 = 
!{!"kernel_arg_base_type", !"sampler_t", !"int*"} +!50 = !{!"kernel_arg_type_qual", !"", !""} + +!1 = !{void (i32, i32, i32 addrspace(1)*)* @test_1, !11, !21, !31, !41, !51} +!11 = !{!"kernel_arg_addr_space", i32 0, i32 0, i32 1} +!21 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!31 = !{!"kernel_arg_type", !"sampler_t", !"sampler_t", !"int*"} +!41 = !{!"kernel_arg_base_type", !"sampler_t", !"sampler_t", !"int*"} +!51 = !{!"kernel_arg_type_qual", !"", !"", !""} + +!2 = !{void (i32, i32, i32, i32 addrspace(1)*)* @test_2, !12, !22, !32, !42, !52} +!12 = !{!"kernel_arg_addr_space", i32 0, i32 0, i32 0, i32 1} +!22 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!32 = !{!"kernel_arg_type", !"sampler_t", !"sampler_t", !"sampler_t", !"int*"} +!42 = !{!"kernel_arg_base_type", !"sampler_t", !"sampler_t", !"sampler_t", !"int*"} +!52 = !{!"kernel_arg_type_qual", !"", !"", !"", !""} diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll index 3863afda5dd3a..e4b16c0a165f4 100644 --- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll +++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -3,7 +3,7 @@ ; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate +declare void @llvm.AMDGPU.barrier.local() nounwind convergent ; SI-LABEL: {{^}}main( diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index 268869daaa321..d43de47660576 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -51,7 +51,7 @@ done: ; GCN-LABEL: {{^}}legal_offset_fi_offset ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; GCN: v_add_i32_e32 
[[OFFSET:v[0-9]+]], 0x8000 +; GCN: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8000 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll index 13fb575b2b151..a68fdecb00af7 100644 --- a/test/CodeGen/AMDGPU/select64.ll +++ b/test/CodeGen/AMDGPU/select64.ll @@ -51,12 +51,8 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa } ; CHECK-LABEL: {{^}}v_select_i64_split_imm: -; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 -; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 -; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] -; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] -; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} -; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}} ; CHECK: s_endpgm define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll index 53694dcffa66e..57365a6e1fc37 100644 --- a/test/CodeGen/AMDGPU/set-dx10.ll +++ b/test/CodeGen/AMDGPU/set-dx10.ll @@ -5,8 +5,8 @@ ; SET*DX10 instructions. 
; CHECK: {{^}}fcmp_une_select_fptosi: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -19,8 +19,8 @@ entry: } ; CHECK: {{^}}fcmp_une_select_i32: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -31,8 +31,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_fptosi: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -45,8 +45,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_i32: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -57,8 +57,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -71,8 +71,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: 
SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -83,8 +83,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -97,8 +97,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -109,8 +109,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -123,8 +123,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -135,8 +135,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_fptosi(i32 addrspace(1)* 
%out, float %in) { entry: @@ -149,8 +149,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll index 4e6a10d6b78d7..63d74820f9613 100644 --- a/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/test/CodeGen/AMDGPU/setcc-opt.ll @@ -142,11 +142,14 @@ define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind } ; FUNC-LABEL: {{^}}cmp_zext_k_i8max: -; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] +; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff +; GCN: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] +; GCN: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] +; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK255]] ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 @@ -187,11 +190,14 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n ; Should do a buffer_load_sbyte and compare with -1 ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: -; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff +; 
GCN: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] +; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK]]{{$}} ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = sext i8 %b to i32 diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index 5aedda2ce1a9c..23ae3b967971d 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -12,8 +12,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI: buffer_store_dword [[EXTRACT]], ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 -; EG-NEXT: LSHR * [[ADDR]] +; EG: LSHR * [[ADDR]] +; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1 define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { %shl = shl i32 %in, 31 %sext = ashr i32 %shl, 31 @@ -609,3 +609,53 @@ define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1 store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } + +; Make sure we propagate the VALUness to users of a moved scalar BFE. 
+ +; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 +; SI-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 63 + %ashr = ashr i64 %shl, 63 + + %and = and i64 %ashr, %s.val + store i64 %and, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64_move_use: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; SI-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] +; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 
addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 32 + %ashr = ashr i64 %shl, 32 + %and = and i64 %ashr, %s.val + store i64 %and, i64 addrspace(1)* %out.gep, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index 6f81a39ed96aa..55db80731c900 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -53,14 +53,14 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ret void } -;EG: {{^}}shl_i64: +;EG-LABEL: {{^}}shl_i64: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}} ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal ;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} ;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 @@ -80,7 +80,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ret void } -;EG: {{^}}shl_v2i64: +;EG-LABEL: {{^}}shl_v2i64: ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] ;EG-DAG: SUB_INT {{\*? 
*}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] ;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] @@ -185,8 +185,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in ; Make sure load width gets reduced to i32 load. ; GCN-LABEL: {{^}}s_shl_32_i64: ; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} -; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll index b1485bfaaebb0..dfb2bf3383fc5 100644 --- a/test/CodeGen/AMDGPU/shl_add_constant.ll +++ b/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -6,7 +6,7 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; FUNC-LABEL: {{^}}shl_2_add_9_i32: ; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { @@ -20,7 +20,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { } ; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: -; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} +; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], vcc, 9, {{v[0-9]+}} ; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} ; SI-DAG: buffer_store_dword [[ADDREG]] ; SI-DAG: buffer_store_dword [[SHLREG]] @@ -40,7 +40,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1 ; FUNC-LABEL: {{^}}shl_2_add_999_i32: ; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] +; SI: v_add_i32_e32 
[[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll index 6671e909cd1dd..ac94824bd61f1 100644 --- a/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -35,7 +35,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad ; SI-LABEL: {{^}}load_shl_base_lds_1: ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 -; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} +; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}} ; SI-DAG: buffer_store_dword [[RESULT]] ; SI-DAG: buffer_store_dword [[ADDUSE]] ; SI: s_endpgm diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll new file mode 100644 index 0000000000000..27a8e70aae137 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -0,0 +1,16 @@ +; RUN: llc -o /dev/null %s -march=amdgcn -mcpu=verde -verify-machineinstrs -stop-after expand-isel-pseudos 2>&1 | FileCheck %s +; This test verifies that the instruction selection will add the implicit +; register operands in the correct order when modifying the opcode of an +; instruction to V_ADD_I32_e32. 
+ +; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/si-literal-folding.ll b/test/CodeGen/AMDGPU/si-literal-folding.ll new file mode 100644 index 0000000000000..901b3c3453fc3 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-literal-folding.ll @@ -0,0 +1,17 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8 + +define void @main(float) #0 { +main_body: + %1 = fmul float %0, 0x3FE86A7F00000000 + %2 = fmul float %0, 0xBFE86A7F00000000 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %1, float %1, float %2, float %2) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 84652701f7731..d7b35fc631ebb 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -6,6 +6,16 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: s_wqm + +; Make sure not emitting unused scratch resource descriptor setup +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 + +; CHECK: s_mov_b32 m0 + + ; Writing to M0 from an SMRD instruction will hang the GPU. 
; CHECK-NOT: s_buffer_load_dword m0 ; CHECK: s_endpgm diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 5a6129aaa3fa0..bc766dbcac676 100644 --- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -155,9 +155,9 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, } ; FUNC-LABEL: @reorder_local_offsets -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 ; CI: buffer_store_dword @@ -181,9 +181,10 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa } ; FUNC-LABEL: @reorder_global_offsets -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 ; CI: buffer_store_dword @@ -233,4 +234,4 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" 
"no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #2 = { nounwind noduplicate } +attributes #2 = { nounwind convergent } diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 0db7cdc171b54..a94ccc32e61c6 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -46,9 +46,9 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { ; SI-LABEL: @v_sint_to_fp_i64_to_f64 ; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll new file mode 100644 index 0000000000000..e646605f7da1d --- /dev/null +++ b/test/CodeGen/AMDGPU/sminmax.ll @@ -0,0 +1,130 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}s_abs_i32: +; GCN: s_abs_i32 +; GCN: s_add_i32 +define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { + %neg = sub i32 0, %val + %cond = icmp sgt i32 %val, %neg + %res = select i1 %cond, i32 %val, i32 %neg + %res2 = add i32 %res, 2 + store i32 %res2, i32 
addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_i32: +; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]] +; GCN: v_add_i32 +define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %neg = sub i32 0, %val + %cond = icmp sgt i32 %val, %neg + %res = select i1 %cond, i32 %val, i32 %neg + %res2 = add i32 %res, 2 + store i32 %res2, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_abs_v2i32: +; GCN: s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { + %z0 = insertelement <2 x i32> undef, i32 0, i32 0 + %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 + %t0 = insertelement <2 x i32> undef, i32 2, i32 0 + %t1 = insertelement <2 x i32> %t0, i32 2, i32 1 + %neg = sub <2 x i32> %z1, %val + %cond = icmp sgt <2 x i32> %val, %neg + %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg + %res2 = add <2 x i32> %res, %t1 + store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_v2i32: +; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] + +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] + +; GCN: v_add_i32 +; GCN: v_add_i32 +define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { + %z0 = insertelement <2 x i32> undef, i32 0, i32 0 + %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 + %t0 = insertelement <2 x i32> undef, i32 2, i32 0 + %t1 = insertelement <2 x i32> %t0, i32 2, i32 1 + %val = load <2 x i32>, <2 x i32> addrspace(1)* %src, align 4 + %neg = sub <2 x i32> %z1, %val + %cond = icmp sgt <2 x i32> %val, %neg + %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg + %res2 
= add <2 x i32> %res, %t1 + store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_abs_v4i32: +; TODO: this should use s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_abs_i32 +; GCN: s_abs_i32 + +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { + %z0 = insertelement <4 x i32> undef, i32 0, i32 0 + %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 + %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 + %z3 = insertelement <4 x i32> %z2, i32 0, i32 3 + %t0 = insertelement <4 x i32> undef, i32 2, i32 0 + %t1 = insertelement <4 x i32> %t0, i32 2, i32 1 + %t2 = insertelement <4 x i32> %t1, i32 2, i32 2 + %t3 = insertelement <4 x i32> %t2, i32 2, i32 3 + %neg = sub <4 x i32> %z3, %val + %cond = icmp sgt <4 x i32> %val, %neg + %res = select <4 x i1> %cond, <4 x i32> %val, <4 x i32> %neg + %res2 = add <4 x i32> %res, %t3 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_v4i32: +; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]] +; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]] + +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]] + +; GCN: v_add_i32 +; GCN: v_add_i32 +; GCN: v_add_i32 +; GCN: v_add_i32 +define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { + %z0 = insertelement <4 x i32> undef, i32 0, i32 0 + %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 + %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 + %z3 = insertelement <4 x i32> %z2, i32 0, i32 3 + %t0 = insertelement <4 x i32> undef, i32 2, i32 0 + %t1 = 
insertelement <4 x i32> %t0, i32 2, i32 1 + %t2 = insertelement <4 x i32> %t1, i32 2, i32 2 + %t3 = insertelement <4 x i32> %t2, i32 2, i32 3 + %val = load <4 x i32>, <4 x i32> addrspace(1)* %src, align 4 + %neg = sub <4 x i32> %z3, %val + %cond = icmp sgt <4 x i32> %val, %neg + %res = select <4 x i1> %cond, <4 x i32> %val, <4 x i32> %neg + %res2 = add <4 x i32> %res, %t3 + store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 0598208e13173..1d6bb9ece8c68 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -1,9 +1,10 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s ; SMRD load with an immediate offset. ; GCN-LABEL: {{^}}smrd0: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: @@ -15,7 +16,7 @@ entry: ; SMRD load with the largest possible immediate offset. 
; GCN-LABEL: {{^}}smrd1: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: @@ -29,6 +30,7 @@ entry: ; GCN-LABEL: {{^}}smrd2: ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { @@ -54,9 +56,37 @@ entry: ret void } +; SMRD load with the largest possible immediate offset on VI +; GCN-LABEL: {{^}}smrd4: +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate on VI +; GCN-LABEL: {{^}}smrd5: +; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 +; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 +; GCN: s_endpgm +define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + ; SMRD load using the load.const intrinsic with an immediate offset ; GCN-LABEL: {{^}}smrd_load_const0: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: 
[0x04 +; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { main_body: @@ -70,7 +100,7 @@ main_body: ; SMRD load using the load.const intrinsic with the largest possible immediate ; offset. ; GCN-LABEL: {{^}}smrd_load_const1: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { main_body: @@ -86,6 +116,7 @@ main_body: ; GCN-LABEL: {{^}}smrd_load_const2: ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { main_body: @@ -96,6 +127,36 @@ main_body: ret void } +; SMRD load with the largest possible immediate offset on VI +; GCN-LABEL: {{^}}smrd_load_const3: +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; 
CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +define void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; SMRD load with an offset greater than the largest possible immediate on VI +; GCN-LABEL: {{^}}smrd_load_const4: +; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 +; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 +; GCN: s_endpgm +define void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + ; Function Attrs: nounwind readnone declare float @llvm.SI.load.const(<16 x i8>, i32) #1 diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll index 46409cdfae1c7..9e181bc14d9db 100644 --- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll +++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -1,4 
+1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone @@ -8,9 +8,22 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; scc instead. ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { +; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}} +; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc +define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { + %v.val = load volatile i32, i32 addrspace(1)* %in + %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, 399 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0: +; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f +; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0 +define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -22,7 +35,20 @@ define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { +define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { + %v.val = load volatile i32, i32 addrspace(1)* %in + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 + %bc = bitcast <2 x i32> %vec.1 
to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1: +; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}} +define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -32,9 +58,9 @@ define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 } ; Doesn't use constants -; FUNC-LABEL @imp_def_vcc_split_i64_add_2 -; SI: v_add_i32 -; SI: v_addc_u32 +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2: +; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid diff --git a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll new file mode 100644 index 0000000000000..4c82ed6affc23 --- /dev/null +++ b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll @@ -0,0 +1,104 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s + +@sPrivateStorage = external addrspace(3) global [256 x [8 x <4 x i64>]] + +; GCN-LABEL: {{^}}ds_reorder_vector_split: + +; Write zeroinitializer +; GCN-DAG: ds_write_b64 [[PTR:v[0-9]+]], [[VAL:v\[[0-9]+:[0-9]+\]]] offset:24 +; GCN-DAG: ds_write_b64 [[PTR]], [[VAL]] offset:16 +; GCN-DAG: ds_write_b64 [[PTR]], [[VAL]] offset:8 +; GCN-DAG: ds_write_b64 [[PTR]], [[VAL]]{{$}} + +; GCN: s_waitcnt vmcnt + +; GCN-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 +; GCN-DAG: 
ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 +; GCN-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 + +; GCN: s_waitcnt lgkmcnt + +; GCN-DAG ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:8 +; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16 +; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24 + +; Appears to be dead store of vector component. +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} + +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: s_endpgm +define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { +entry: + %tmp = tail call i32 @llvm.r600.read.local.size.y() + %tmp1 = tail call i32 @llvm.r600.read.local.size.z() + %tmp2 = tail call i32 @llvm.r600.read.tidig.x() + %tmp3 = tail call i32 @llvm.r600.read.tidig.y() + %tmp4 = tail call i32 @llvm.r600.read.tidig.z() + %tmp6 = mul i32 %tmp2, %tmp + %tmp10 = add i32 %tmp3, %tmp6 + %tmp11 = mul i32 %tmp10, %tmp1 + %tmp9 = add i32 %tmp11, %tmp4 + %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %x.i.12.i = tail call i32 @llvm.r600.read.local.size.x() #1 + %mul.26.i = mul i32 %x.i.12.i, %x.i.i + %add.i = add i32 %tmp2, %mul.26.i + %arrayidx = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %add.i + store <4 x i64> zeroinitializer, <4 x i64> addrspace(3)* %arrayidx + %tmp12 = sext i32 %add.i to i64 + %arrayidx1 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %srcValues, i64 %tmp12 + %tmp13 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %tmp12 + %tmp14 = load i32, i32 addrspace(1)* %arrayidx2 + %add.ptr = getelementptr [256 x [8 x <4 x i64>]], [256 x 
[8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 0, i32 %alignmentOffset + %mul.i = shl i32 %tmp14, 2 + %arrayidx.i = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr, i32 %mul.i + %tmp15 = bitcast i64 addrspace(3)* %arrayidx.i to <4 x i64> addrspace(3)* + store <4 x i64> %tmp13, <4 x i64> addrspace(3)* %tmp15 + %add.ptr6 = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %tmp14, i32 %alignmentOffset + %tmp16 = sext i32 %tmp14 to i64 + %tmp17 = sext i32 %alignmentOffset to i64 + %add.ptr9 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %destBuffer, i64 %tmp16, i64 %tmp17 + %tmp18 = bitcast <4 x i64> %tmp13 to i256 + %trunc = trunc i256 %tmp18 to i64 + store i64 %trunc, i64 addrspace(1)* %add.ptr9 + %arrayidx10.1 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 1 + %tmp19 = load i64, i64 addrspace(3)* %arrayidx10.1 + %arrayidx11.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 1 + store i64 %tmp19, i64 addrspace(1)* %arrayidx11.1 + %arrayidx10.2 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 2 + %tmp20 = load i64, i64 addrspace(3)* %arrayidx10.2 + %arrayidx11.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 2 + store i64 %tmp20, i64 addrspace(1)* %arrayidx11.2 + %arrayidx10.3 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 3 + %tmp21 = load i64, i64 addrspace(3)* %arrayidx10.3 + %arrayidx11.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 3 + store i64 %tmp21, i64 addrspace(1)* %arrayidx11.3 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.y() #1 + +; Function Attrs: nounwind 
readnone +declare i32 @llvm.r600.read.local.size.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.z() #1 + +attributes #0 = { norecurse nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index bcbc32f4c0539..3b59bbfb18c03 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -70,11 +70,11 @@ entry: ;EG-LABEL: {{^}}ashr_i64_2: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}} ;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index 0dad91e709d99..bbd9543563222 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -65,14 +65,14 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; EG: SUB_INT {{\*? 
*}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]|PV\.[XYZW]}} ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}} +; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]] ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 @@ -190,8 +190,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ; Make sure load width gets reduced to i32 load. 
; GCN-LABEL: {{^}}s_lshr_32_i64: ; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}} -; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll index 4a72b4d090adf..ba4049f28a6e5 100644 --- a/test/CodeGen/AMDGPU/store-barrier.ll +++ b/test/CodeGen/AMDGPU/store-barrier.ll @@ -36,7 +36,7 @@ bb: ret void } -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll index 0f89405e073b0..d22f43fa05ef3 100644 --- a/test/CodeGen/AMDGPU/store.ll +++ b/test/CodeGen/AMDGPU/store.ll @@ -287,16 +287,33 @@ entry: ; CM: LDS_WRITE ; CM: LDS_WRITE -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 +; SI: ds_write_b64 +; SI: ds_write_b64 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out ret void } +; FUNC-LABEL: {{^}}store_local_v4i32_align4: +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write2_b32 +; SI: ds_write2_b32 +define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}store_local_i64_i8: ; EG: LDS_BYTE_WRITE ; SI: ds_write_b8 diff --git a/test/CodeGen/AMDGPU/store_typed.ll b/test/CodeGen/AMDGPU/store_typed.ll new file mode 100644 index 
0000000000000..515fcf04f4067 --- /dev/null +++ b/test/CodeGen/AMDGPU/store_typed.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck --check-prefix=CM --check-prefix=FUNC %s + +; store to rat 0 +; FUNC-LABEL: {{^}}store_typed_rat0: +; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1 +; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}} + +define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { + call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0) + ret void +} + +; store to rat 11 +; FUNC-LABEL: {{^}}store_typed_rat11: +; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1 +; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}} + +define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { + call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11) + ret void +} + +declare void @llvm.r600.rat.store.typed(<4 x i32>, <4 x i32>, i32) diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index b7fba0efa5b29..9f9446a4e6087 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -7,7 +7,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; FUNC-LABEL: {{^}}test_sub_i32: ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in @@ -22,8 +22,8 @@ define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 
v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -40,10 +40,10 @@ define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1) ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll index bf690ca4cb282..ad52d0f2e2380 100644 --- a/test/CodeGen/AMDGPU/trunc.ll +++ b/test/CodeGen/AMDGPU/trunc.ll @@ -61,7 +61,7 @@ define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { } ; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: -; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} +; SI: s_and_b32 s{{[0-9]+}}, 1, s{{[0-9]+}} ; SI: v_cmp_eq_i32 define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { %trunc = trunc i32 %a to i1 @@ -72,9 +72,9 @@ define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { ; SI-LABEL: {{^}}s_trunc_i64_to_i1: ; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: 
v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +; SI: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] +; SI: v_cmp_eq_i32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll index b3837f28209af..f692b7dfdc271 100644 --- a/test/CodeGen/AMDGPU/udivrem.ll +++ b/test/CodeGen/AMDGPU/udivrem.ll @@ -30,19 +30,19 @@ ; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] ; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] ; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] -; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] +; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]] ; SI: v_cndmask_b32_e64 ; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] -; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] -; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] +; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], vcc, [[E]], [[RCP]] +; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], vcc, [[E]], [[RCP]] ; SI: v_cndmask_b32_e64 ; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] ; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] +; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] +; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]] ; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 @@ -110,15 +110,15 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { ; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] ; SI-DAG: v_mul_hi_u32 
[[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] ; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[FIRST_RCP_LO]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], vcc, [[FIRST_Num_S_Remainder]], v{{[0-9]+}} ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] @@ -133,15 +133,15 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { ; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] ; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] ; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[SECOND_RCP_LO]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], vcc, [[SECOND_E]], 
[[SECOND_RCP]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], vcc, [[SECOND_Num_S_Remainder]], v{{[0-9]+}} ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] @@ -257,83 +257,83 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3 ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; 
SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] -; 
SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] -; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] -; SI-DAG: v_add_i32_e32 
[[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 6f608df5e9f55..65fe580792a59 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -4,9 +4,9 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 ; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll index 8ab4faf2f1458..d120111a71fb3 100644 --- a/test/CodeGen/AMDGPU/unsupported-cc.ll +++ b/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -3,8 +3,8 @@ ; These tests are for condition codes that are not supported by the hardware ; CHECK-LABEL: {{^}}slt: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @slt(i32 addrspace(1)* %out, i32 %in) { entry: @@ -15,8 +15,8 @@ entry: } ; 
CHECK-LABEL: {{^}}ult_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -40,8 +40,8 @@ entry: } ; CHECK-LABEL: {{^}}ult_float_native: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ult_float_native(float addrspace(1)* %out, float %in) { entry: @@ -52,8 +52,8 @@ entry: } ; CHECK-LABEL: {{^}}olt: -; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 1084227584(5.000000e+00) define void @olt(float addrspace(1)* %out, float %in) { entry: @@ -64,8 +64,8 @@ entry: } ; CHECK-LABEL: {{^}}sle: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @sle(i32 addrspace(1)* %out, i32 %in) { entry: @@ -76,8 +76,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -101,8 +101,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_float_native: -; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? 
*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ule_float_native(float addrspace(1)* %out, float %in) { entry: @@ -113,8 +113,8 @@ entry: } ; CHECK-LABEL: {{^}}ole: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT:1084227584(5.000000e+00) define void @ole(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index f26f30022b4f3..87b925a24a041 100644 --- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s declare float @llvm.fma.f32(float, float, float) #1 +declare double @llvm.fma.f64(double, double, double) #1 declare float @llvm.fmuladd.f32(float, float, float) #1 declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 @@ -40,6 +41,32 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa ret void } +; GCN-LABEL: {{^}}test_use_s_v_s: +; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} + +; GCN: buffer_load_dword [[VA0:v[0-9]+]] +; GCN-NOT: v_mov_b32 +; GCN: buffer_load_dword [[VA1:v[0-9]+]] + +; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-NOT: v_mov_b32 + +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { + %va0 = load volatile float, float addrspace(1)* %in + %va1 = load volatile float, 
float addrspace(1)* %in + %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1 + %fma1 = call float @llvm.fma.f32(float %a, float %va1, float %b) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc @@ -99,5 +126,145 @@ define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 ret void } +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_kimm: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT0]] +define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 + store float %fma, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR0]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], [[SGPR1]] +; GCN: 
buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +; GCN: s_endpgm +define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 + %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 + store float %fma, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +; GCN: s_endpgm +define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 + %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k: +; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 
[[VK:v[0-9]+]], 0x44800000 +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 + store float %fma, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +; GCN: s_endpgm +define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 + %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_s0_s1_k_f32: +; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 +; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]] + +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]] +; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]] + +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { + %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1 + 
%fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; FIXME: Immediate in SGPRs just copied to VGPRs +; GCN-LABEL: {{^}}test_s0_s1_k_f64: +; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} +; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000 +; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}} + +; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] +; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] +; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}} + +; Same zero component is re-used for half of each immediate. +; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 +; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}} + +; GCN: buffer_store_dwordx2 [[RESULT0]] +; GCN: buffer_store_dwordx2 [[RESULT1]] +define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { + %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1 + %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1 + store volatile double %fma0, double addrspace(1)* %out + store volatile double %fma1, double addrspace(1)* %out + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll index 7d0ebd139f518..1cbefba60c95f 100644 --- a/test/CodeGen/AMDGPU/valu-i1.ll +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -78,8 +78,8 @@ exit: ; SI: BB2_3: ; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: v_cmp_eq_i32_e32 vcc, +; SI-DAG: buffer_store_dword +; 
SI-DAG: v_cmp_eq_i32_e32 vcc, ; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] ; SI: s_andn2_b64 exec, exec, [[OR_SREG]] ; SI: s_cbranch_execnz BB2_3 @@ -128,18 +128,18 @@ exit: ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] ; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] -; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] -; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] +; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] +; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]] ; SI: s_cbranch_execz BB3_5 ; SI: BB#4: ; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 vcc -; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] +; SI: v_cmp_ge_i64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]] +; SI: s_or_b64 [[COND_STATE]], [[CMP]], [[COND_STATE]] ; SI: BB3_5: -; SI: s_or_b64 exec, exec, [[ORNEG1]] -; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] +; SI: s_or_b64 exec, exec, [[ORNEG2]] +; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[COND_STATE]] ; SI: s_andn2_b64 exec, exec, [[COND_STATE]] ; SI: s_cbranch_execnz BB3_3 diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll new file mode 100644 index 0000000000000..cd7c78f408ddd --- /dev/null +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -0,0 +1,585 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s +; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | 
FileCheck -check-prefix=GCN -check-prefix=VIHSA %s + +; This ends up using all 256 registers and requires register +; scavenging which will fail to find an unsued register. + +; Check the ScratchSize to avoid regressions from spilling +; intermediate register class copies. + +; FIXME: The same register is initialized to 0 for every spill. + +declare i32 @llvm.r600.read.tgid.x() #1 +declare i32 @llvm.r600.read.tgid.y() #1 +declare i32 @llvm.r600.read.tgid.z() #1 + +; GCN-LABEL: {{^}}spill_vgpr_compute: + +; GCN: s_mov_b32 s16, s3 +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + + +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 + +; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset. 
+define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +bb: + %tmp = add i32 %arg1, %arg2 + %tmp7 = extractelement <4 x float> %arg6, i32 0 + %tmp8 = extractelement <4 x float> %arg6, i32 1 + %tmp9 = extractelement <4 x float> %arg6, i32 2 + %tmp10 = extractelement <4 x float> %arg6, i32 3 + %tmp11 = bitcast float %arg5 to i32 + br label %bb12 + +bb12: ; preds = %bb145, %bb + %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ] + %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ] + %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ] + %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ] + %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ] + %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ] + %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ] + %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ] + %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ] + %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ] + %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ] + %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ] + %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 
] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ] + %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ] + %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ] + %tmp61 = phi float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ] + %tmp68 = 
phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ] + %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp282, %bb145 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ] + %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ] + %tmp99 = phi float [ 
0.000000e+00, %bb ], [ %tmp364, %bb145 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ] + %tmp116 = phi float [ 0.000000e+00, %bb ], [ %tmp381, %bb145 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ] + %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ] + %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ] + 
%tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] + %tmp142 = bitcast float %tmp95 to i32 + %tmp143 = icmp sgt i32 %tmp142, 125 + br i1 %tmp143, label %bb144, label %bb145 + +bb144: ; preds = %bb12 + store volatile float %arg3, float addrspace(1)* %arg + store volatile float %tmp91, float addrspace(1)* %arg + store volatile float %tmp90, float addrspace(1)* %arg + store volatile float %tmp89, float addrspace(1)* %arg + store volatile float %tmp87, float addrspace(1)* %arg + store volatile float %tmp86, float addrspace(1)* %arg + store volatile float %tmp85, float addrspace(1)* %arg + store volatile float %tmp83, float addrspace(1)* %arg + store volatile float %tmp82, float addrspace(1)* %arg + store volatile float %tmp81, float addrspace(1)* %arg + store volatile float %tmp79, float addrspace(1)* %arg + store volatile float %tmp78, float addrspace(1)* %arg + store volatile float %tmp77, float addrspace(1)* %arg + store volatile float %tmp75, float addrspace(1)* %arg + store volatile float %tmp74, float addrspace(1)* %arg + store volatile float %tmp73, float addrspace(1)* %arg + store volatile float %tmp71, float addrspace(1)* %arg + store volatile float %tmp70, float addrspace(1)* %arg + store volatile float %tmp69, float addrspace(1)* %arg + store 
volatile float %tmp67, float addrspace(1)* %arg + store volatile float %tmp66, float addrspace(1)* %arg + store volatile float %tmp65, float addrspace(1)* %arg + store volatile float %tmp63, float addrspace(1)* %arg + store volatile float %tmp62, float addrspace(1)* %arg + store volatile float %tmp61, float addrspace(1)* %arg + store volatile float %tmp59, float addrspace(1)* %arg + store volatile float %tmp58, float addrspace(1)* %arg + store volatile float %tmp57, float addrspace(1)* %arg + store volatile float %tmp55, float addrspace(1)* %arg + store volatile float %tmp54, float addrspace(1)* %arg + store volatile float %tmp53, float addrspace(1)* %arg + store volatile float %tmp51, float addrspace(1)* %arg + store volatile float %tmp50, float addrspace(1)* %arg + store volatile float %tmp49, float addrspace(1)* %arg + store volatile float %tmp47, float addrspace(1)* %arg + store volatile float %tmp46, float addrspace(1)* %arg + store volatile float %tmp45, float addrspace(1)* %arg + store volatile float %tmp43, float addrspace(1)* %arg + store volatile float %tmp42, float addrspace(1)* %arg + store volatile float %tmp41, float addrspace(1)* %arg + store volatile float %tmp39, float addrspace(1)* %arg + store volatile float %tmp38, float addrspace(1)* %arg + store volatile float %tmp37, float addrspace(1)* %arg + store volatile float %tmp35, float addrspace(1)* %arg + store volatile float %tmp34, float addrspace(1)* %arg + store volatile float %tmp33, float addrspace(1)* %arg + store volatile float %tmp31, float addrspace(1)* %arg + store volatile float %tmp30, float addrspace(1)* %arg + store volatile float %tmp29, float addrspace(1)* %arg + store volatile float %tmp27, float addrspace(1)* %arg + store volatile float %tmp26, float addrspace(1)* %arg + store volatile float %tmp25, float addrspace(1)* %arg + store volatile float %tmp23, float addrspace(1)* %arg + store volatile float %tmp22, float addrspace(1)* %arg + store volatile float %tmp21, float 
addrspace(1)* %arg + store volatile float %tmp19, float addrspace(1)* %arg + store volatile float %tmp18, float addrspace(1)* %arg + store volatile float %tmp17, float addrspace(1)* %arg + store volatile float %tmp15, float addrspace(1)* %arg + store volatile float %tmp14, float addrspace(1)* %arg + store volatile float %tmp13, float addrspace(1)* %arg + store volatile float %tmp16, float addrspace(1)* %arg + store volatile float %tmp20, float addrspace(1)* %arg + store volatile float %tmp24, float addrspace(1)* %arg + store volatile float %tmp28, float addrspace(1)* %arg + store volatile float %tmp32, float addrspace(1)* %arg + store volatile float %tmp36, float addrspace(1)* %arg + store volatile float %tmp40, float addrspace(1)* %arg + store volatile float %tmp44, float addrspace(1)* %arg + store volatile float %tmp48, float addrspace(1)* %arg + store volatile float %tmp52, float addrspace(1)* %arg + store volatile float %tmp56, float addrspace(1)* %arg + store volatile float %tmp60, float addrspace(1)* %arg + store volatile float %tmp64, float addrspace(1)* %arg + store volatile float %tmp68, float addrspace(1)* %arg + store volatile float %tmp72, float addrspace(1)* %arg + store volatile float %tmp76, float addrspace(1)* %arg + store volatile float %tmp80, float addrspace(1)* %arg + store volatile float %tmp84, float addrspace(1)* %arg + store volatile float %tmp88, float addrspace(1)* %arg + store volatile float %tmp92, float addrspace(1)* %arg + store volatile float %tmp93, float addrspace(1)* %arg + store volatile float %tmp94, float addrspace(1)* %arg + store volatile float %tmp96, float addrspace(1)* %arg + store volatile float %tmp97, float addrspace(1)* %arg + store volatile float %tmp98, float addrspace(1)* %arg + store volatile float %tmp99, float addrspace(1)* %arg + store volatile float %tmp100, float addrspace(1)* %arg + store volatile float %tmp101, float addrspace(1)* %arg + store volatile float %tmp102, float addrspace(1)* %arg + store volatile 
float %tmp103, float addrspace(1)* %arg + store volatile float %tmp104, float addrspace(1)* %arg + store volatile float %tmp105, float addrspace(1)* %arg + store volatile float %tmp106, float addrspace(1)* %arg + store volatile float %tmp107, float addrspace(1)* %arg + store volatile float %tmp108, float addrspace(1)* %arg + store volatile float %tmp109, float addrspace(1)* %arg + store volatile float %tmp110, float addrspace(1)* %arg + store volatile float %tmp111, float addrspace(1)* %arg + store volatile float %tmp112, float addrspace(1)* %arg + store volatile float %tmp113, float addrspace(1)* %arg + store volatile float %tmp114, float addrspace(1)* %arg + store volatile float %tmp115, float addrspace(1)* %arg + store volatile float %tmp116, float addrspace(1)* %arg + store volatile float %tmp117, float addrspace(1)* %arg + store volatile float %tmp118, float addrspace(1)* %arg + store volatile float %tmp119, float addrspace(1)* %arg + store volatile float %tmp120, float addrspace(1)* %arg + store volatile float %tmp121, float addrspace(1)* %arg + store volatile float %tmp122, float addrspace(1)* %arg + store volatile float %tmp123, float addrspace(1)* %arg + store volatile float %tmp124, float addrspace(1)* %arg + store volatile float %tmp125, float addrspace(1)* %arg + store volatile float %tmp126, float addrspace(1)* %arg + store volatile float %tmp127, float addrspace(1)* %arg + store volatile float %tmp128, float addrspace(1)* %arg + store volatile float %tmp129, float addrspace(1)* %arg + store volatile float %tmp130, float addrspace(1)* %arg + store volatile float %tmp131, float addrspace(1)* %arg + store volatile float %tmp132, float addrspace(1)* %arg + store volatile float %tmp133, float addrspace(1)* %arg + store volatile float %tmp134, float addrspace(1)* %arg + store volatile float %tmp135, float addrspace(1)* %arg + store volatile float %tmp136, float addrspace(1)* %arg + store volatile float %tmp137, float addrspace(1)* %arg + store volatile 
float %tmp138, float addrspace(1)* %arg + store volatile float %tmp139, float addrspace(1)* %arg + store volatile float %arg4, float addrspace(1)* %arg + store volatile float %tmp7, float addrspace(1)* %arg + store volatile float %tmp8, float addrspace(1)* %arg + store volatile float %tmp9, float addrspace(1)* %arg + store volatile float %tmp10, float addrspace(1)* %arg + ret void + +bb145: ; preds = %bb12 + %tmp146 = bitcast float %tmp95 to i32 + %tmp147 = bitcast float %tmp95 to i32 + %tmp148 = add i32 %tmp11, %tmp147 + %tmp149 = bitcast i32 %tmp148 to float + %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0 + %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1 + %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2 + %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3 + %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4 + %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5 + %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6 + %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7 + %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8 + %tmp159 = insertelement <128 x float> %tmp158, float %tmp79, i32 9 + %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10 + %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11 + %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16 + %tmp167 = insertelement <128 x float> %tmp166, float %tmp69, i32 17 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19 + %tmp170 = insertelement <128 x float> 
%tmp169, float %tmp65, i32 20 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32 + %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37 + %tmp188 = insertelement <128 x float> %tmp187, float %tmp41, i32 38 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48 + %tmp199 
= insertelement <128 x float> %tmp198, float %tmp26, i32 49 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61 + %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65 + %tmp216 = insertelement <128 x float> %tmp215, float %tmp40, i32 66 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67 + %tmp218 = insertelement <128 x float> %tmp217, float %tmp48, i32 68 + %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69 + %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70 + %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71 + %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72 + %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73 + %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74 + %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75 + %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76 + %tmp227 = insertelement <128 x float> %tmp226, 
float %tmp84, i32 77 + %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78 + %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79 + %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80 + %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81 + %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82 + %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83 + %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84 + %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85 + %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86 + %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87 + %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88 + %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89 + %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90 + %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91 + %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92 + %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93 + %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94 + %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 95 + %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96 + %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97 + %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98 + %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99 + %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100 + %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101 + %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102 + %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103 + %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104 + %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, 
i32 105 + %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106 + %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107 + %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108 + %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109 + %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110 + %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111 + %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112 + %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113 + %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114 + %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115 + %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116 + %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117 + %tmp268 = insertelement <128 x float> %tmp267, float %tmp132, i32 118 + %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119 + %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120 + %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121 + %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 122 + %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123 + %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124 + %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125 + %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126 + %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127 + %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146 + %tmp279 = extractelement <128 x float> %tmp278, i32 0 + %tmp280 = extractelement <128 x float> %tmp278, i32 1 + %tmp281 = extractelement <128 x float> %tmp278, i32 2 + %tmp282 = extractelement <128 x float> %tmp278, i32 3 + %tmp283 = extractelement <128 x float> %tmp278, i32 4 + %tmp284 = extractelement <128 x float> %tmp278, 
i32 5 + %tmp285 = extractelement <128 x float> %tmp278, i32 6 + %tmp286 = extractelement <128 x float> %tmp278, i32 7 + %tmp287 = extractelement <128 x float> %tmp278, i32 8 + %tmp288 = extractelement <128 x float> %tmp278, i32 9 + %tmp289 = extractelement <128 x float> %tmp278, i32 10 + %tmp290 = extractelement <128 x float> %tmp278, i32 11 + %tmp291 = extractelement <128 x float> %tmp278, i32 12 + %tmp292 = extractelement <128 x float> %tmp278, i32 13 + %tmp293 = extractelement <128 x float> %tmp278, i32 14 + %tmp294 = extractelement <128 x float> %tmp278, i32 15 + %tmp295 = extractelement <128 x float> %tmp278, i32 16 + %tmp296 = extractelement <128 x float> %tmp278, i32 17 + %tmp297 = extractelement <128 x float> %tmp278, i32 18 + %tmp298 = extractelement <128 x float> %tmp278, i32 19 + %tmp299 = extractelement <128 x float> %tmp278, i32 20 + %tmp300 = extractelement <128 x float> %tmp278, i32 21 + %tmp301 = extractelement <128 x float> %tmp278, i32 22 + %tmp302 = extractelement <128 x float> %tmp278, i32 23 + %tmp303 = extractelement <128 x float> %tmp278, i32 24 + %tmp304 = extractelement <128 x float> %tmp278, i32 25 + %tmp305 = extractelement <128 x float> %tmp278, i32 26 + %tmp306 = extractelement <128 x float> %tmp278, i32 27 + %tmp307 = extractelement <128 x float> %tmp278, i32 28 + %tmp308 = extractelement <128 x float> %tmp278, i32 29 + %tmp309 = extractelement <128 x float> %tmp278, i32 30 + %tmp310 = extractelement <128 x float> %tmp278, i32 31 + %tmp311 = extractelement <128 x float> %tmp278, i32 32 + %tmp312 = extractelement <128 x float> %tmp278, i32 33 + %tmp313 = extractelement <128 x float> %tmp278, i32 34 + %tmp314 = extractelement <128 x float> %tmp278, i32 35 + %tmp315 = extractelement <128 x float> %tmp278, i32 36 + %tmp316 = extractelement <128 x float> %tmp278, i32 37 + %tmp317 = extractelement <128 x float> %tmp278, i32 38 + %tmp318 = extractelement <128 x float> %tmp278, i32 39 + %tmp319 = extractelement <128 x float> %tmp278, i32 40 + 
%tmp320 = extractelement <128 x float> %tmp278, i32 41 + %tmp321 = extractelement <128 x float> %tmp278, i32 42 + %tmp322 = extractelement <128 x float> %tmp278, i32 43 + %tmp323 = extractelement <128 x float> %tmp278, i32 44 + %tmp324 = extractelement <128 x float> %tmp278, i32 45 + %tmp325 = extractelement <128 x float> %tmp278, i32 46 + %tmp326 = extractelement <128 x float> %tmp278, i32 47 + %tmp327 = extractelement <128 x float> %tmp278, i32 48 + %tmp328 = extractelement <128 x float> %tmp278, i32 49 + %tmp329 = extractelement <128 x float> %tmp278, i32 50 + %tmp330 = extractelement <128 x float> %tmp278, i32 51 + %tmp331 = extractelement <128 x float> %tmp278, i32 52 + %tmp332 = extractelement <128 x float> %tmp278, i32 53 + %tmp333 = extractelement <128 x float> %tmp278, i32 54 + %tmp334 = extractelement <128 x float> %tmp278, i32 55 + %tmp335 = extractelement <128 x float> %tmp278, i32 56 + %tmp336 = extractelement <128 x float> %tmp278, i32 57 + %tmp337 = extractelement <128 x float> %tmp278, i32 58 + %tmp338 = extractelement <128 x float> %tmp278, i32 59 + %tmp339 = extractelement <128 x float> %tmp278, i32 60 + %tmp340 = extractelement <128 x float> %tmp278, i32 61 + %tmp341 = extractelement <128 x float> %tmp278, i32 62 + %tmp342 = extractelement <128 x float> %tmp278, i32 63 + %tmp343 = extractelement <128 x float> %tmp278, i32 64 + %tmp344 = extractelement <128 x float> %tmp278, i32 65 + %tmp345 = extractelement <128 x float> %tmp278, i32 66 + %tmp346 = extractelement <128 x float> %tmp278, i32 67 + %tmp347 = extractelement <128 x float> %tmp278, i32 68 + %tmp348 = extractelement <128 x float> %tmp278, i32 69 + %tmp349 = extractelement <128 x float> %tmp278, i32 70 + %tmp350 = extractelement <128 x float> %tmp278, i32 71 + %tmp351 = extractelement <128 x float> %tmp278, i32 72 + %tmp352 = extractelement <128 x float> %tmp278, i32 73 + %tmp353 = extractelement <128 x float> %tmp278, i32 74 + %tmp354 = extractelement <128 x float> %tmp278, i32 75 + 
%tmp355 = extractelement <128 x float> %tmp278, i32 76 + %tmp356 = extractelement <128 x float> %tmp278, i32 77 + %tmp357 = extractelement <128 x float> %tmp278, i32 78 + %tmp358 = extractelement <128 x float> %tmp278, i32 79 + %tmp359 = extractelement <128 x float> %tmp278, i32 80 + %tmp360 = extractelement <128 x float> %tmp278, i32 81 + %tmp361 = extractelement <128 x float> %tmp278, i32 82 + %tmp362 = extractelement <128 x float> %tmp278, i32 83 + %tmp363 = extractelement <128 x float> %tmp278, i32 84 + %tmp364 = extractelement <128 x float> %tmp278, i32 85 + %tmp365 = extractelement <128 x float> %tmp278, i32 86 + %tmp366 = extractelement <128 x float> %tmp278, i32 87 + %tmp367 = extractelement <128 x float> %tmp278, i32 88 + %tmp368 = extractelement <128 x float> %tmp278, i32 89 + %tmp369 = extractelement <128 x float> %tmp278, i32 90 + %tmp370 = extractelement <128 x float> %tmp278, i32 91 + %tmp371 = extractelement <128 x float> %tmp278, i32 92 + %tmp372 = extractelement <128 x float> %tmp278, i32 93 + %tmp373 = extractelement <128 x float> %tmp278, i32 94 + %tmp374 = extractelement <128 x float> %tmp278, i32 95 + %tmp375 = extractelement <128 x float> %tmp278, i32 96 + %tmp376 = extractelement <128 x float> %tmp278, i32 97 + %tmp377 = extractelement <128 x float> %tmp278, i32 98 + %tmp378 = extractelement <128 x float> %tmp278, i32 99 + %tmp379 = extractelement <128 x float> %tmp278, i32 100 + %tmp380 = extractelement <128 x float> %tmp278, i32 101 + %tmp381 = extractelement <128 x float> %tmp278, i32 102 + %tmp382 = extractelement <128 x float> %tmp278, i32 103 + %tmp383 = extractelement <128 x float> %tmp278, i32 104 + %tmp384 = extractelement <128 x float> %tmp278, i32 105 + %tmp385 = extractelement <128 x float> %tmp278, i32 106 + %tmp386 = extractelement <128 x float> %tmp278, i32 107 + %tmp387 = extractelement <128 x float> %tmp278, i32 108 + %tmp388 = extractelement <128 x float> %tmp278, i32 109 + %tmp389 = extractelement <128 x float> %tmp278, i32 
110 + %tmp390 = extractelement <128 x float> %tmp278, i32 111 + %tmp391 = extractelement <128 x float> %tmp278, i32 112 + %tmp392 = extractelement <128 x float> %tmp278, i32 113 + %tmp393 = extractelement <128 x float> %tmp278, i32 114 + %tmp394 = extractelement <128 x float> %tmp278, i32 115 + %tmp395 = extractelement <128 x float> %tmp278, i32 116 + %tmp396 = extractelement <128 x float> %tmp278, i32 117 + %tmp397 = extractelement <128 x float> %tmp278, i32 118 + %tmp398 = extractelement <128 x float> %tmp278, i32 119 + %tmp399 = extractelement <128 x float> %tmp278, i32 120 + %tmp400 = extractelement <128 x float> %tmp278, i32 121 + %tmp401 = extractelement <128 x float> %tmp278, i32 122 + %tmp402 = extractelement <128 x float> %tmp278, i32 123 + %tmp403 = extractelement <128 x float> %tmp278, i32 124 + %tmp404 = extractelement <128 x float> %tmp278, i32 125 + %tmp405 = extractelement <128 x float> %tmp278, i32 126 + %tmp406 = extractelement <128 x float> %tmp278, i32 127 + %tmp407 = bitcast float %tmp95 to i32 + %tmp408 = add i32 %tmp407, 1 + %tmp409 = bitcast i32 %tmp408 to float + br label %bb12 +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll new file mode 100644 index 0000000000000..16abb89bb0b80 --- /dev/null +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -0,0 +1,494 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; This ends up using all 255 registers and requires register +; scavenging which will fail to find an unsued register. + +; Check the ScratchSize to avoid regressions from spilling +; intermediate register class copies. 
+ +; FIXME: The same register is initialized to 0 for every spill. + +; GCN-LABEL: {{^}}main: + +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + +; s12 is offset user SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 + +define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { +bb: + %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0 + %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp12 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 0) + %tmp13 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 16) + %tmp14 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 32) + %tmp15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 + %tmp16 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp15, align 16, !tbaa !0 + %tmp17 = add i32 %arg5, %arg7 + %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp16, i32 0, i32 %tmp17) + %tmp19 = extractelement <4 x float> %tmp18, i32 0 + %tmp20 = extractelement <4 x float> %tmp18, i32 1 + %tmp21 = extractelement <4 x float> %tmp18, i32 2 + %tmp22 = extractelement <4 x float> %tmp18, i32 3 + %tmp23 = bitcast float %tmp14 to i32 + br label %bb24 + +bb24: ; preds = %bb157, %bb + %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb157 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb157 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb157 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp351, 
%bb157 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb157 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb157 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb157 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb157 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb157 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb157 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb157 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb157 ] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb157 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb157 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb157 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb157 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb157 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb157 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb157 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb157 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb157 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb157 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb157 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb157 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb157 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb157 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb157 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb157 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb157 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb157 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb157 ] + %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb157 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb157 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb157 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb157 ] + 
%tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb157 ] + %tmp61 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb157 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb157 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb157 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb157 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb157 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb157 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb157 ] + %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb157 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb157 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb157 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb157 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb157 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb157 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb157 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb157 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb157 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb157 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb157 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb157 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb157 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb157 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb157 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb157 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb157 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb157 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb157 ] + %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb157 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb157 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp302, %bb157 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb157 ] + %tmp91 = phi 
float [ 0.000000e+00, %bb ], [ %tmp300, %bb157 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb157 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb157 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb157 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb157 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb157 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb157 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb157 ] + %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb157 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb157 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb157 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb157 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb157 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb157 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb157 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb157 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp421, %bb157 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb157 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb157 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb157 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb157 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb157 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb157 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb157 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb157 ] + %tmp116 = phi float [ 0.000000e+00, %bb ], [ %tmp381, %bb157 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb157 ] + %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb157 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb157 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb157 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb157 ] + 
%tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb157 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb157 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb157 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb157 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb157 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb157 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb157 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb157 ] + %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb157 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb157 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb157 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb157 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb157 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb157 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb157 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb157 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb157 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb157 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb157 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb157 ] + %tmp142 = phi float [ 0.000000e+00, %bb ], [ %tmp407, %bb157 ] + %tmp143 = phi float [ 0.000000e+00, %bb ], [ %tmp408, %bb157 ] + %tmp144 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb157 ] + %tmp145 = phi float [ 0.000000e+00, %bb ], [ %tmp410, %bb157 ] + %tmp146 = phi float [ 0.000000e+00, %bb ], [ %tmp411, %bb157 ] + %tmp147 = phi float [ 0.000000e+00, %bb ], [ %tmp412, %bb157 ] + %tmp148 = phi float [ 0.000000e+00, %bb ], [ %tmp413, %bb157 ] + %tmp149 = phi float [ 0.000000e+00, %bb ], [ %tmp414, %bb157 ] + %tmp150 = phi float [ 0.000000e+00, %bb ], [ %tmp415, %bb157 ] + %tmp151 = phi float [ 0.000000e+00, %bb ], [ %tmp416, %bb157 ] + %tmp152 = phi float [ 0.000000e+00, %bb ], [ 
%tmp417, %bb157 ] + %tmp153 = phi float [ 0.000000e+00, %bb ], [ %tmp418, %bb157 ] + %tmp154 = bitcast float %tmp107 to i32 + %tmp155 = icmp sgt i32 %tmp154, 125 + br i1 %tmp155, label %bb156, label %bb157 + +bb156: ; preds = %bb24 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp12, float %tmp103, float %tmp102, float %tmp101) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 33, i32 0, float %tmp99, float %tmp98, float %tmp97, float %tmp95) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 34, i32 0, float %tmp94, float %tmp93, float %tmp91, float %tmp90) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 35, i32 0, float %tmp89, float %tmp87, float %tmp86, float %tmp85) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 36, i32 0, float %tmp83, float %tmp82, float %tmp81, float %tmp79) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 37, i32 0, float %tmp78, float %tmp77, float %tmp75, float %tmp74) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 38, i32 0, float %tmp73, float %tmp71, float %tmp70, float %tmp69) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 39, i32 0, float %tmp67, float %tmp66, float %tmp65, float %tmp63) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 40, i32 0, float %tmp62, float %tmp61, float %tmp59, float %tmp58) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 41, i32 0, float %tmp57, float %tmp55, float %tmp54, float %tmp53) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 42, i32 0, float %tmp51, float %tmp50, float %tmp49, float %tmp47) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 43, i32 0, float %tmp46, float %tmp45, float %tmp43, float %tmp42) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 44, i32 0, float %tmp41, float %tmp39, float %tmp38, float %tmp37) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 45, i32 0, float %tmp35, float %tmp34, float %tmp33, float %tmp31) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 46, i32 0, float 
%tmp30, float %tmp29, float %tmp27, float %tmp26) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 47, i32 0, float %tmp25, float %tmp28, float %tmp32, float %tmp36) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 48, i32 0, float %tmp40, float %tmp44, float %tmp48, float %tmp52) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 49, i32 0, float %tmp56, float %tmp60, float %tmp64, float %tmp68) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 50, i32 0, float %tmp72, float %tmp76, float %tmp80, float %tmp84) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 51, i32 0, float %tmp88, float %tmp92, float %tmp96, float %tmp100) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 52, i32 0, float %tmp104, float %tmp105, float %tmp106, float %tmp108) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 53, i32 0, float %tmp109, float %tmp110, float %tmp111, float %tmp112) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 54, i32 0, float %tmp113, float %tmp114, float %tmp115, float %tmp116) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 55, i32 0, float %tmp117, float %tmp118, float %tmp119, float %tmp120) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 56, i32 0, float %tmp121, float %tmp122, float %tmp123, float %tmp124) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 57, i32 0, float %tmp125, float %tmp126, float %tmp127, float %tmp128) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 58, i32 0, float %tmp129, float %tmp130, float %tmp131, float %tmp132) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 59, i32 0, float %tmp133, float %tmp134, float %tmp135, float %tmp136) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 60, i32 0, float %tmp137, float %tmp138, float %tmp139, float %tmp140) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 61, i32 0, float %tmp141, float %tmp142, float %tmp143, float %tmp144) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 62, i32 0, float %tmp145, float 
%tmp146, float %tmp147, float %tmp148) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float %tmp149, float %tmp150, float %tmp151, float %tmp13) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + ret void + +bb157: ; preds = %bb24 + %tmp158 = bitcast float %tmp107 to i32 + %tmp159 = bitcast float %tmp107 to i32 + %tmp160 = add i32 %tmp23, %tmp159 + %tmp161 = bitcast i32 %tmp160 to float + %tmp162 = insertelement <128 x float> undef, float %tmp103, i32 0 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp102, i32 1 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp101, i32 2 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp99, i32 3 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp98, i32 4 + %tmp167 = insertelement <128 x float> %tmp166, float %tmp97, i32 5 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp95, i32 6 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp94, i32 7 + %tmp170 = insertelement <128 x float> %tmp169, float %tmp93, i32 8 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp91, i32 9 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp90, i32 10 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp89, i32 11 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp87, i32 12 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp86, i32 13 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp85, i32 14 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp83, i32 15 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp82, i32 16 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp81, i32 17 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp79, i32 18 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp78, i32 19 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp77, i32 20 + %tmp183 = insertelement <128 x float> %tmp182, float 
%tmp75, i32 21 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp74, i32 22 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp73, i32 23 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp71, i32 24 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp70, i32 25 + %tmp188 = insertelement <128 x float> %tmp187, float %tmp69, i32 26 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp67, i32 27 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp66, i32 28 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp65, i32 29 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp63, i32 30 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp62, i32 31 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp61, i32 32 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp59, i32 33 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp58, i32 34 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp57, i32 35 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp55, i32 36 + %tmp199 = insertelement <128 x float> %tmp198, float %tmp54, i32 37 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp53, i32 38 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp51, i32 39 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp50, i32 40 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp49, i32 41 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp47, i32 42 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp46, i32 43 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp45, i32 44 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp43, i32 45 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp42, i32 46 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp41, i32 47 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp39, i32 48 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp38, i32 49 + %tmp212 = 
insertelement <128 x float> %tmp211, float %tmp37, i32 50 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp35, i32 51 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp34, i32 52 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp33, i32 53 + %tmp216 = insertelement <128 x float> %tmp215, float %tmp31, i32 54 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp30, i32 55 + %tmp218 = insertelement <128 x float> %tmp217, float %tmp29, i32 56 + %tmp219 = insertelement <128 x float> %tmp218, float %tmp27, i32 57 + %tmp220 = insertelement <128 x float> %tmp219, float %tmp26, i32 58 + %tmp221 = insertelement <128 x float> %tmp220, float %tmp25, i32 59 + %tmp222 = insertelement <128 x float> %tmp221, float %tmp28, i32 60 + %tmp223 = insertelement <128 x float> %tmp222, float %tmp32, i32 61 + %tmp224 = insertelement <128 x float> %tmp223, float %tmp36, i32 62 + %tmp225 = insertelement <128 x float> %tmp224, float %tmp40, i32 63 + %tmp226 = insertelement <128 x float> %tmp225, float %tmp44, i32 64 + %tmp227 = insertelement <128 x float> %tmp226, float %tmp48, i32 65 + %tmp228 = insertelement <128 x float> %tmp227, float %tmp52, i32 66 + %tmp229 = insertelement <128 x float> %tmp228, float %tmp56, i32 67 + %tmp230 = insertelement <128 x float> %tmp229, float %tmp60, i32 68 + %tmp231 = insertelement <128 x float> %tmp230, float %tmp64, i32 69 + %tmp232 = insertelement <128 x float> %tmp231, float %tmp68, i32 70 + %tmp233 = insertelement <128 x float> %tmp232, float %tmp72, i32 71 + %tmp234 = insertelement <128 x float> %tmp233, float %tmp76, i32 72 + %tmp235 = insertelement <128 x float> %tmp234, float %tmp80, i32 73 + %tmp236 = insertelement <128 x float> %tmp235, float %tmp84, i32 74 + %tmp237 = insertelement <128 x float> %tmp236, float %tmp88, i32 75 + %tmp238 = insertelement <128 x float> %tmp237, float %tmp92, i32 76 + %tmp239 = insertelement <128 x float> %tmp238, float %tmp96, i32 77 + %tmp240 = insertelement <128 x float> %tmp239, 
float %tmp100, i32 78 + %tmp241 = insertelement <128 x float> %tmp240, float %tmp104, i32 79 + %tmp242 = insertelement <128 x float> %tmp241, float %tmp105, i32 80 + %tmp243 = insertelement <128 x float> %tmp242, float %tmp106, i32 81 + %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 82 + %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 83 + %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 84 + %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 85 + %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 86 + %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 87 + %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 88 + %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 89 + %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 90 + %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 91 + %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 92 + %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 93 + %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 94 + %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 95 + %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 96 + %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 97 + %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 98 + %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 99 + %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 100 + %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 101 + %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 102 + %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 103 + %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 104 + %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 105 + %tmp268 = insertelement <128 x float> %tmp267, float 
%tmp132, i32 106 + %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 107 + %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 108 + %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 109 + %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 110 + %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 111 + %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 112 + %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 113 + %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 114 + %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 115 + %tmp278 = insertelement <128 x float> %tmp277, float %tmp142, i32 116 + %tmp279 = insertelement <128 x float> %tmp278, float %tmp143, i32 117 + %tmp280 = insertelement <128 x float> %tmp279, float %tmp144, i32 118 + %tmp281 = insertelement <128 x float> %tmp280, float %tmp145, i32 119 + %tmp282 = insertelement <128 x float> %tmp281, float %tmp146, i32 120 + %tmp283 = insertelement <128 x float> %tmp282, float %tmp147, i32 121 + %tmp284 = insertelement <128 x float> %tmp283, float %tmp148, i32 122 + %tmp285 = insertelement <128 x float> %tmp284, float %tmp149, i32 123 + %tmp286 = insertelement <128 x float> %tmp285, float %tmp150, i32 124 + %tmp287 = insertelement <128 x float> %tmp286, float %tmp151, i32 125 + %tmp288 = insertelement <128 x float> %tmp287, float %tmp152, i32 126 + %tmp289 = insertelement <128 x float> %tmp288, float %tmp153, i32 127 + %tmp290 = insertelement <128 x float> %tmp289, float %tmp161, i32 %tmp158 + %tmp291 = extractelement <128 x float> %tmp290, i32 0 + %tmp292 = extractelement <128 x float> %tmp290, i32 1 + %tmp293 = extractelement <128 x float> %tmp290, i32 2 + %tmp294 = extractelement <128 x float> %tmp290, i32 3 + %tmp295 = extractelement <128 x float> %tmp290, i32 4 + %tmp296 = extractelement <128 x float> %tmp290, i32 5 + %tmp297 = extractelement <128 x float> %tmp290, i32 6 + 
%tmp298 = extractelement <128 x float> %tmp290, i32 7 + %tmp299 = extractelement <128 x float> %tmp290, i32 8 + %tmp300 = extractelement <128 x float> %tmp290, i32 9 + %tmp301 = extractelement <128 x float> %tmp290, i32 10 + %tmp302 = extractelement <128 x float> %tmp290, i32 11 + %tmp303 = extractelement <128 x float> %tmp290, i32 12 + %tmp304 = extractelement <128 x float> %tmp290, i32 13 + %tmp305 = extractelement <128 x float> %tmp290, i32 14 + %tmp306 = extractelement <128 x float> %tmp290, i32 15 + %tmp307 = extractelement <128 x float> %tmp290, i32 16 + %tmp308 = extractelement <128 x float> %tmp290, i32 17 + %tmp309 = extractelement <128 x float> %tmp290, i32 18 + %tmp310 = extractelement <128 x float> %tmp290, i32 19 + %tmp311 = extractelement <128 x float> %tmp290, i32 20 + %tmp312 = extractelement <128 x float> %tmp290, i32 21 + %tmp313 = extractelement <128 x float> %tmp290, i32 22 + %tmp314 = extractelement <128 x float> %tmp290, i32 23 + %tmp315 = extractelement <128 x float> %tmp290, i32 24 + %tmp316 = extractelement <128 x float> %tmp290, i32 25 + %tmp317 = extractelement <128 x float> %tmp290, i32 26 + %tmp318 = extractelement <128 x float> %tmp290, i32 27 + %tmp319 = extractelement <128 x float> %tmp290, i32 28 + %tmp320 = extractelement <128 x float> %tmp290, i32 29 + %tmp321 = extractelement <128 x float> %tmp290, i32 30 + %tmp322 = extractelement <128 x float> %tmp290, i32 31 + %tmp323 = extractelement <128 x float> %tmp290, i32 32 + %tmp324 = extractelement <128 x float> %tmp290, i32 33 + %tmp325 = extractelement <128 x float> %tmp290, i32 34 + %tmp326 = extractelement <128 x float> %tmp290, i32 35 + %tmp327 = extractelement <128 x float> %tmp290, i32 36 + %tmp328 = extractelement <128 x float> %tmp290, i32 37 + %tmp329 = extractelement <128 x float> %tmp290, i32 38 + %tmp330 = extractelement <128 x float> %tmp290, i32 39 + %tmp331 = extractelement <128 x float> %tmp290, i32 40 + %tmp332 = extractelement <128 x float> %tmp290, i32 41 + %tmp333 
= extractelement <128 x float> %tmp290, i32 42 + %tmp334 = extractelement <128 x float> %tmp290, i32 43 + %tmp335 = extractelement <128 x float> %tmp290, i32 44 + %tmp336 = extractelement <128 x float> %tmp290, i32 45 + %tmp337 = extractelement <128 x float> %tmp290, i32 46 + %tmp338 = extractelement <128 x float> %tmp290, i32 47 + %tmp339 = extractelement <128 x float> %tmp290, i32 48 + %tmp340 = extractelement <128 x float> %tmp290, i32 49 + %tmp341 = extractelement <128 x float> %tmp290, i32 50 + %tmp342 = extractelement <128 x float> %tmp290, i32 51 + %tmp343 = extractelement <128 x float> %tmp290, i32 52 + %tmp344 = extractelement <128 x float> %tmp290, i32 53 + %tmp345 = extractelement <128 x float> %tmp290, i32 54 + %tmp346 = extractelement <128 x float> %tmp290, i32 55 + %tmp347 = extractelement <128 x float> %tmp290, i32 56 + %tmp348 = extractelement <128 x float> %tmp290, i32 57 + %tmp349 = extractelement <128 x float> %tmp290, i32 58 + %tmp350 = extractelement <128 x float> %tmp290, i32 59 + %tmp351 = extractelement <128 x float> %tmp290, i32 60 + %tmp352 = extractelement <128 x float> %tmp290, i32 61 + %tmp353 = extractelement <128 x float> %tmp290, i32 62 + %tmp354 = extractelement <128 x float> %tmp290, i32 63 + %tmp355 = extractelement <128 x float> %tmp290, i32 64 + %tmp356 = extractelement <128 x float> %tmp290, i32 65 + %tmp357 = extractelement <128 x float> %tmp290, i32 66 + %tmp358 = extractelement <128 x float> %tmp290, i32 67 + %tmp359 = extractelement <128 x float> %tmp290, i32 68 + %tmp360 = extractelement <128 x float> %tmp290, i32 69 + %tmp361 = extractelement <128 x float> %tmp290, i32 70 + %tmp362 = extractelement <128 x float> %tmp290, i32 71 + %tmp363 = extractelement <128 x float> %tmp290, i32 72 + %tmp364 = extractelement <128 x float> %tmp290, i32 73 + %tmp365 = extractelement <128 x float> %tmp290, i32 74 + %tmp366 = extractelement <128 x float> %tmp290, i32 75 + %tmp367 = extractelement <128 x float> %tmp290, i32 76 + %tmp368 = 
extractelement <128 x float> %tmp290, i32 77 + %tmp369 = extractelement <128 x float> %tmp290, i32 78 + %tmp370 = extractelement <128 x float> %tmp290, i32 79 + %tmp371 = extractelement <128 x float> %tmp290, i32 80 + %tmp372 = extractelement <128 x float> %tmp290, i32 81 + %tmp373 = extractelement <128 x float> %tmp290, i32 82 + %tmp374 = extractelement <128 x float> %tmp290, i32 83 + %tmp375 = extractelement <128 x float> %tmp290, i32 84 + %tmp376 = extractelement <128 x float> %tmp290, i32 85 + %tmp377 = extractelement <128 x float> %tmp290, i32 86 + %tmp378 = extractelement <128 x float> %tmp290, i32 87 + %tmp379 = extractelement <128 x float> %tmp290, i32 88 + %tmp380 = extractelement <128 x float> %tmp290, i32 89 + %tmp381 = extractelement <128 x float> %tmp290, i32 90 + %tmp382 = extractelement <128 x float> %tmp290, i32 91 + %tmp383 = extractelement <128 x float> %tmp290, i32 92 + %tmp384 = extractelement <128 x float> %tmp290, i32 93 + %tmp385 = extractelement <128 x float> %tmp290, i32 94 + %tmp386 = extractelement <128 x float> %tmp290, i32 95 + %tmp387 = extractelement <128 x float> %tmp290, i32 96 + %tmp388 = extractelement <128 x float> %tmp290, i32 97 + %tmp389 = extractelement <128 x float> %tmp290, i32 98 + %tmp390 = extractelement <128 x float> %tmp290, i32 99 + %tmp391 = extractelement <128 x float> %tmp290, i32 100 + %tmp392 = extractelement <128 x float> %tmp290, i32 101 + %tmp393 = extractelement <128 x float> %tmp290, i32 102 + %tmp394 = extractelement <128 x float> %tmp290, i32 103 + %tmp395 = extractelement <128 x float> %tmp290, i32 104 + %tmp396 = extractelement <128 x float> %tmp290, i32 105 + %tmp397 = extractelement <128 x float> %tmp290, i32 106 + %tmp398 = extractelement <128 x float> %tmp290, i32 107 + %tmp399 = extractelement <128 x float> %tmp290, i32 108 + %tmp400 = extractelement <128 x float> %tmp290, i32 109 + %tmp401 = extractelement <128 x float> %tmp290, i32 110 + %tmp402 = extractelement <128 x float> %tmp290, i32 111 + 
%tmp403 = extractelement <128 x float> %tmp290, i32 112 + %tmp404 = extractelement <128 x float> %tmp290, i32 113 + %tmp405 = extractelement <128 x float> %tmp290, i32 114 + %tmp406 = extractelement <128 x float> %tmp290, i32 115 + %tmp407 = extractelement <128 x float> %tmp290, i32 116 + %tmp408 = extractelement <128 x float> %tmp290, i32 117 + %tmp409 = extractelement <128 x float> %tmp290, i32 118 + %tmp410 = extractelement <128 x float> %tmp290, i32 119 + %tmp411 = extractelement <128 x float> %tmp290, i32 120 + %tmp412 = extractelement <128 x float> %tmp290, i32 121 + %tmp413 = extractelement <128 x float> %tmp290, i32 122 + %tmp414 = extractelement <128 x float> %tmp290, i32 123 + %tmp415 = extractelement <128 x float> %tmp290, i32 124 + %tmp416 = extractelement <128 x float> %tmp290, i32 125 + %tmp417 = extractelement <128 x float> %tmp290, i32 126 + %tmp418 = extractelement <128 x float> %tmp290, i32 127 + %tmp419 = bitcast float %tmp107 to i32 + %tmp420 = add i32 %tmp419, 1 + %tmp421 = bitcast i32 %tmp420 to float + br label %bb24 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" } +attributes #1 = { nounwind readnone } + +!0 = !{!1, !1, i64 0, i32 1} +!1 = !{!"const", null} diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll index 9b2f229c05af1..2bfe1b2bd6ec2 100644 --- a/test/CodeGen/AMDGPU/vop-shrink.ll +++ b/test/CodeGen/AMDGPU/vop-shrink.ll @@ -3,8 +3,8 @@ ; Test that we correctly commute a sub instruction ; FUNC-LABEL: {{^}}sub_rev: -; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s -; SI: v_subrev_i32_e32 v{{[0-9]+}}, s +; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, vcc, s +; SI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, s ; ModuleID = 
'vop-shrink.ll' diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll index 5cc7577cad332..107e84b33be9a 100644 --- a/test/CodeGen/AMDGPU/wait.ll +++ b/test/CodeGen/AMDGPU/wait.ll @@ -1,11 +1,16 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_load_dwordx4 -; CHECK: s_load_dwordx4 -; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; CHECK: s_endpgm +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT +; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX +; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX +; The ilpmax scheduler is used for the second test to get the ordering we want for the test. 
+ +; DEFAULT-LABEL: {{^}}main: +; DEFAULT: s_load_dwordx4 +; DEFAULT: s_load_dwordx4 +; DEFAULT: s_waitcnt vmcnt(0) +; DEFAULT: exp +; DEFAULT: s_waitcnt lgkmcnt(0) +; DEFAULT: s_endpgm define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 @@ -29,7 +34,43 @@ main_body: ret void } -; Function Attrs: noduplicate nounwind +; ILPMAX-LABEL: {{^}}main2: +; ILPMAX: s_load_dwordx4 +; ILPMAX: s_waitcnt lgkmcnt(0) +; ILPMAX: buffer_load +; ILPMAX: s_load_dwordx4 +; ILPMAX: s_waitcnt lgkmcnt(0) +; ILPMAX: buffer_load +; ILPMAX: s_waitcnt vmcnt(1) +; ILPMAX: s_waitcnt vmcnt(0) +; ILPMAX: s_endpgm + +define void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* +byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 { +main_body: + %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0 + %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0 + %13 = add i32 %5, %7 + %14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13) + %15 = extractelement <4 x float> %14, i32 0 + %16 = extractelement <4 x float> %14, i32 1 + %17 = extractelement <4 x float> %14, i32 2 + %18 = extractelement <4 x float> %14, i32 3 + %19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1 + %20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0 + %21 = add i32 %5, %7 + %22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21) + %23 = extractelement <4 x float> %22, i32 0 + %24 = extractelement <4 x float> %22, i32 1 + %25 
= extractelement <4 x float> %22, i32 2 + %26 = extractelement <4 x float> %22, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18) + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26) + ret void +} + + +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.global() #1 ; Function Attrs: nounwind readnone @@ -38,7 +79,7 @@ declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { "ShaderType"="1" } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index 4328e964c1bf8..e7fcd1ff36501 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -1,15 +1,34 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN 
-check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}ngroups_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].X +; EG: MOV {{\*? *}}[[VAL]], KC0[0].X + +; HSA: .amd_kernel_code_t + +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; HSA: .end_amd_kernel_code_t + + +; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] define void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 @@ -19,12 +38,12 @@ entry: ; FUNC-LABEL: {{^}}ngroups_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Y +; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 @@ -34,12 +53,12 @@ entry: ; FUNC-LABEL: {{^}}ngroups_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Z +; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 @@ -49,12 +68,12 @@ entry: ; FUNC-LABEL: {{^}}global_size_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].W +; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].W -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 @@ -64,12 +83,12 @@ entry: ; FUNC-LABEL: {{^}}global_size_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].X +; EG: MOV {{\*? *}}[[VAL]], KC0[1].X -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 @@ -79,12 +98,12 @@ entry: ; FUNC-LABEL: {{^}}global_size_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Y +; EG: MOV {{\*? 
*}}[[VAL]], KC0[1].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 @@ -92,74 +111,34 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: 
{{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. +; The tgid values are stored in sgprs offset by the number of user +; sgprs. ; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { +; HSA: .amd_kernel_code_t +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword [[VVAL]] + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -167,9 +146,26 @@ entry: } ; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 -; GCN: 
buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 1 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 +; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword [[VVAL]] + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -177,36 +173,81 @@ entry: } ; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 1 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword 
[[VVAL]] + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: -; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; GCN-NOHSA: buffer_store_dword v0 +; HSA: flat_store_dword v0 +define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: -; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { + +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 +; GCN-NOHSA: buffer_store_dword v1 +; HSA: flat_store_dword v1 +define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: -; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 +; GCN-NOHSA: buffer_store_dword v2 +; HSA: flat_store_dword v2 +define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -221,10 +262,6 @@ declare i32 @llvm.r600.read.global.size.x() #0 declare i32 @llvm.r600.read.global.size.y() #0 declare i32 @llvm.r600.read.global.size.z() #0 -declare i32 @llvm.r600.read.local.size.x() #0 -declare 
i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0 diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index ddb920af29d84..655655d92f08f 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -38,7 +38,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in } ; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}} ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll index 033055db185a4..35ddf2b0a465b 100644 --- a/test/CodeGen/AMDGPU/zero_extend.ll +++ b/test/CodeGen/AMDGPU/zero_extend.ll @@ -7,8 +7,7 @@ ; R600: MEM_RAT_CACHELESS STORE_RAW ; SI: {{^}}test: -; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} -; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] +; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: diff --git a/test/CodeGen/ARM/2007-03-13-InstrSched.ll b/test/CodeGen/ARM/2007-03-13-InstrSched.ll index 9c0143be06c37..81a6bb64971d6 100644 --- a/test/CodeGen/ARM/2007-03-13-InstrSched.ll +++ b/test/CodeGen/ARM/2007-03-13-InstrSched.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \ ; RUN: -mattr=+v6 | grep r9 ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \ -; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats 2>&1 | grep asm-printer +; RUN: -mattr=+v6,+reserve-r9 -ifcvt-limit=0 -stats 2>&1 | grep asm-printer ; | grep 35 define void @test(i32 %tmp56222, i32 %tmp36224, i32 %tmp46223, i32 %i.0196.0.ph, i32 %tmp8, 
i32* %tmp1011, i32** %tmp1, i32* %d2.1.out, i32* %d3.1.out, i32* %d0.1.out, i32* %d1.1.out) { diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll index 3f47488372b8c..613694f091d1e 100644 --- a/test/CodeGen/ARM/2009-10-16-Scope.ll +++ b/test/CodeGen/ARM/2009-10-16-Scope.ll @@ -24,9 +24,9 @@ declare i32 @foo(i32) ssp !0 = !DILocation(line: 5, column: 2, scope: !1) !1 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !2) -!2 = !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !3) -!3 = !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: 0, file: !8, retainedTypes: !9) -!4 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "count_", line: 5, scope: !5, file: !3, type: !6) +!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !3) +!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: 0, file: !8, retainedTypes: !9) +!4 = !DILocalVariable(name: "count_", line: 5, scope: !5, file: !3, type: !6) !5 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !1) !6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !7 = !DILocation(line: 6, column: 1, scope: !2) diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll index 638b26c73146d..1341830b4a4b1 100644 --- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll +++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll @@ -14,11 +14,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!15} -!0 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "b", line: 93, arg: 0, scope: !1, file: !2, type: !6) -!1 = 
!DISubprogram(name: "__addvsi3", linkageName: "__addvsi3", line: 94, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !12, scope: null, type: !4) +!0 = !DILocalVariable(name: "b", line: 93, arg: 2, scope: !1, file: !2, type: !6) +!1 = distinct !DISubprogram(name: "__addvsi3", linkageName: "__addvsi3", line: 94, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !12, scope: null, type: !4) !2 = !DIFile(filename: "libgcc2.c", directory: "/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc") !12 = !DIFile(filename: "libgcc2.c", directory: "/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc") -!3 = !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: 0, file: !12, enums: !13, retainedTypes: !13, subprograms: !14) +!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: 0, file: !12, enums: !13, retainedTypes: !13, subprograms: !14) !4 = !DISubroutineType(types: !5) !5 = !{!6, !6, !6} !6 = !DIDerivedType(tag: DW_TAG_typedef, name: "SItype", line: 152, file: !12, baseType: !8) diff --git a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll index cfaffd8234ba2..171b6d2bcc5c9 100644 --- a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll +++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll @@ -1,36 +1,36 @@ ; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null ; This test would crash the rewriter when trying to handle a spill after one of -; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register. +; the @llvm.arm.neon.vld3.v8i8.p0i8 defined three parts of a register. 
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } -declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly +declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind { - %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1] %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1] - %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1] %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1] - %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] + %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1] - %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] + %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp2f = extractvalue %struct.__neon_int8x8x3_t 
%tmp1f, 0 ; <<8 x i8>> [#uses=1] - %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1] %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1] - %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1] %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1] %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1] %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1] %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1] %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2] - call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1) + call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1) %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1] %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1] %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1] @@ -38,8 +38,8 @@ define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1] %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1] %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2] - call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1) + call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 
x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1) %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1] - tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1) + tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1) ret <8 x i8> %tmp4 } diff --git a/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/test/CodeGen/ARM/2010-05-21-BuildVector.ll index 6a6ccf3d0a014..c6c0e2caee420 100644 --- a/test/CodeGen/ARM/2010-05-21-BuildVector.ll +++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll @@ -36,8 +36,8 @@ entry: %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3 %19 = fmul <4 x float> %tmp5, %2 %20 = bitcast float* %fltp to i8* - tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1) + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %20, <4 x float> %19, i32 1) ret void } -declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind diff --git a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll index f86c3ba9ef6e3..1deb98631a4f6 100644 --- a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll +++ b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll @@ -12,8 +12,8 @@ entry: %tmp9 = trunc i128 %tmp8 to i64 ; [#uses=1] %tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1] %tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1] - tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind + tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind ret void } -declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind diff --git a/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll 
b/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll index bead8d9781e84..47a5ef0bc5444 100755 --- a/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll +++ b/test/CodeGen/ARM/2010-06-21-nondarwin-tc.ll @@ -20,7 +20,7 @@ @.str51 = external constant [45 x i8] ; <[45 x i8]*> [#uses=1] @__PRETTY_FUNCTION__._ZNK4llvm7VarInit12getFieldInitERNS_6RecordEPKNS_9RecordValERKSs = external constant [116 x i8] ; <[116 x i8]*> [#uses=1] -@_ZN4llvm9RecordValC1ERKSsPNS_5RecTyEj = alias void (%"class.llvm::RecordVal"*, %"class.std::basic_string"*, %"struct.llvm::Init"*, i32)* @_ZN4llvm9RecordValC2ERKSsPNS_5RecTyEj ; [#uses=0] +@_ZN4llvm9RecordValC1ERKSsPNS_5RecTyEj = alias void (%"class.llvm::RecordVal"*, %"class.std::basic_string"*, %"struct.llvm::Init"*, i32), void (%"class.llvm::RecordVal"*, %"class.std::basic_string"*, %"struct.llvm::Init"*, i32)* @_ZN4llvm9RecordValC2ERKSsPNS_5RecTyEj ; [#uses=0] declare i8* @__dynamic_cast(i8*, i8*, i8*, i32) diff --git a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll index 95bb2769759e6..38b352c473b1c 100644 --- a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll +++ b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll @@ -47,19 +47,19 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !llvm.dbg.lv.fn = !{!0, !8, !10, !12} !llvm.dbg.gv = !{!14} -!0 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "buf", line: 4, arg: 0, scope: !1, file: !2, type: !6) -!1 = !DISubprogram(name: "x0", linkageName: "x0", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !26, scope: null, type: !4) +!0 = !DILocalVariable(name: "buf", line: 4, arg: 1, scope: !1, file: !2, type: !6) +!1 = distinct !DISubprogram(name: "x0", linkageName: "x0", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !26, scope: null, type: !4) !2 = !DIFile(filename: "t.c", directory: "/private/tmp") -!3 = 
!DICompileUnit(language: DW_LANG_C99, producer: "clang 2.0", isOptimized: true, file: !26) +!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 2.0", isOptimized: true, file: !26) !4 = !DISubroutineType(types: !5) !5 = !{null} !6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !26, scope: !2, baseType: !7) !7 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) -!8 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "nbytes", line: 4, arg: 0, scope: !1, file: !2, type: !9) +!8 = !DILocalVariable(name: "nbytes", line: 4, arg: 2, scope: !1, file: !2, type: !9) !9 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned long", size: 32, align: 32, encoding: DW_ATE_unsigned) -!10 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "nread", line: 6, scope: !11, file: !2, type: !9) +!10 = !DILocalVariable(name: "nread", line: 6, scope: !11, file: !2, type: !9) !11 = distinct !DILexicalBlock(line: 5, column: 1, file: !26, scope: !1) -!12 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "c", line: 7, scope: !11, file: !2, type: !13) +!12 = !DILocalVariable(name: "c", line: 7, scope: !11, file: !2, type: !13) !13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !14 = !DIGlobalVariable(name: "length", linkageName: "length", line: 1, isLocal: false, isDefinition: true, scope: !2, file: !2, type: !13, variable: i32* @length) !15 = !DILocation(line: 4, column: 24, scope: !1) diff --git a/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll b/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll index 1aee5088eee4e..130221d38c235 100644 --- a/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll +++ b/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll @@ -16,10 +16,10 @@ target triple = "thumbv7-apple-darwin10" define i32 @test(i8* %arg) nounwind { entry: - %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1) + %0 = call <2 x 
i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %arg, i32 1) %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> store <2 x i64> %1, <2 x i64>* undef, align 16 ret i32 undef } -declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly +declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll index 953e2bbf291c3..14ddb59b53870 100644 --- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll +++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll @@ -3,7 +3,7 @@ %struct.SVal = type { i8*, i32 } -define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp { +define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp !dbg !17 { entry: %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !23, metadata !DIExpression()), !dbg !24 @@ -31,7 +31,7 @@ return: ; preds = %bb2 ret i32 %.0, !dbg !29 } -define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 { +define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 !dbg !16 { entry: %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] call void @llvm.dbg.value(metadata %struct.SVal* %this, i64 0, metadata !31, metadata !DIExpression()), !dbg !34 @@ -47,7 +47,7 @@ return: ; preds = %entry declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone -define i32 @main() nounwind ssp { +define i32 @main() nounwind ssp !dbg !20 { entry: %0 = alloca %struct.SVal ; <%struct.SVal*> [#uses=3] %v = alloca %struct.SVal ; <%struct.SVal*> [#uses=4] @@ -80,7 +80,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !0 = !DISubprogram(name: "SVal", line: 11, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14) !1 = !DICompositeType(tag: DW_TAG_structure_type, name: "SVal", 
line: 1, size: 128, align: 64, file: !48, elements: !4) !2 = !DIFile(filename: "small.cc", directory: "/Users/manav/R8248330") -!3 = !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 1, file: !48, enums: !47, retainedTypes: !47, subprograms: !46, globals: !47, imports: !47) +!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 1, file: !48, enums: !47, retainedTypes: !47, subprograms: !46, globals: !47, imports: !47) !4 = !{!5, !7, !0, !9} !5 = !DIDerivedType(tag: DW_TAG_member, name: "Data", line: 7, size: 64, align: 64, file: !48, scope: !1, baseType: !6) !6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !48, baseType: null) @@ -93,35 +93,35 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !14 = !DISubroutineType(types: !15) !15 = !{null, !12} -!16 = !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14, function: void (%struct.SVal*)* @_ZN4SValC1Ev) -!17 = !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !18, function: i32 (i32, %struct.SVal*)* @_Z3fooi4SVal) +!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14) +!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !18) !18 = !DISubroutineType(types: !19) !19 = !{!13, 
!13, !1} -!20 = !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !21, function: i32 ()* @main) +!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !21) !21 = !DISubroutineType(types: !22) !22 = !{!13} -!23 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "i", line: 16, arg: 0, scope: !17, file: !2, type: !13) +!23 = !DILocalVariable(name: "i", line: 16, arg: 1, scope: !17, file: !2, type: !13) !24 = !DILocation(line: 16, scope: !17) -!25 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "location", line: 16, arg: 0, scope: !17, file: !2, type: !26) +!25 = !DILocalVariable(name: "location", line: 16, arg: 2, scope: !17, file: !2, type: !26) !26 = !DIDerivedType(tag: DW_TAG_reference_type, name: "SVal", size: 64, align: 64, file: !48, scope: !2, baseType: !1) !27 = !DILocation(line: 17, scope: !28) !28 = distinct !DILexicalBlock(line: 16, column: 0, file: !2, scope: !17) !29 = !DILocation(line: 18, scope: !28) !30 = !DILocation(line: 20, scope: !28) -!31 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "this", line: 11, arg: 0, scope: !16, file: !2, type: !32) +!31 = !DILocalVariable(name: "this", line: 11, arg: 1, scope: !16, file: !2, type: !32) !32 = !DIDerivedType(tag: DW_TAG_const_type, size: 64, align: 64, flags: DIFlagArtificial, file: !48, scope: !2, baseType: !33) !33 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !48, scope: !2, baseType: !1) !34 = !DILocation(line: 11, scope: !16) !35 = !DILocation(line: 11, scope: !36) !36 = distinct !DILexicalBlock(line: 11, column: 0, file: !48, scope: !37) !37 = distinct !DILexicalBlock(line: 11, column: 0, file: !48, scope: !16) -!38 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "v", line: 24, scope: !39, file: !2, type: !1) +!38 = !DILocalVariable(name: 
"v", line: 24, scope: !39, file: !2, type: !1) !39 = distinct !DILexicalBlock(line: 23, column: 0, file: !48, scope: !40) !40 = distinct !DILexicalBlock(line: 23, column: 0, file: !48, scope: !20) !41 = !DILocation(line: 24, scope: !39) !42 = !DILocation(line: 25, scope: !39) !43 = !DILocation(line: 26, scope: !39) -!44 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "k", line: 26, scope: !39, file: !2, type: !13) +!44 = !DILocalVariable(name: "k", line: 26, scope: !39, file: !2, type: !13) !45 = !DILocation(line: 27, scope: !39) !46 = !{!16, !17, !20} !47 = !{} diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll index 9a5baf21b8fbd..d5eed8b6a2c47 100644 --- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll +++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll @@ -28,7 +28,7 @@ target triple = "thumbv7-apple-darwin10" ; CHECK-NOT: {{DW_TAG|NULL}} ; CHECK: DW_AT_location [DW_FORM_exprloc] (<0x8> 03 [[ADDR]] 10 01 22 ) -define zeroext i8 @get1(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get1(i8 zeroext %a) nounwind optsize !dbg !0 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !10, metadata !DIExpression()), !dbg !30 %0 = load i8, i8* @x1, align 4, !dbg !30 @@ -39,7 +39,7 @@ entry: declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone -define zeroext i8 @get2(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get2(i8 zeroext %a) nounwind optsize !dbg !6 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !18, metadata !DIExpression()), !dbg !32 %0 = load i8, i8* @x2, align 4, !dbg !32 @@ -48,7 +48,7 @@ entry: ret i8 %0, !dbg !33 } -define zeroext i8 @get3(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get3(i8 zeroext %a) nounwind optsize !dbg !7 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !21, metadata !DIExpression()), !dbg !34 %0 = load i8, i8* @x3, align 4, !dbg !34 @@ -57,7 +57,7 @@ 
entry: ret i8 %0, !dbg !35 } -define zeroext i8 @get4(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get4(i8 zeroext %a) nounwind optsize !dbg !8 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !24, metadata !DIExpression()), !dbg !36 %0 = load i8, i8* @x4, align 4, !dbg !36 @@ -66,7 +66,7 @@ entry: ret i8 %0, !dbg !37 } -define zeroext i8 @get5(i8 zeroext %a) nounwind optsize { +define zeroext i8 @get5(i8 zeroext %a) nounwind optsize !dbg !9 { entry: tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !27, metadata !DIExpression()), !dbg !38 %0 = load i8, i8* @x5, align 4, !dbg !38 @@ -78,35 +78,35 @@ entry: !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!49} -!0 = !DISubprogram(name: "get1", linkageName: "get1", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 4, file: !47, scope: !1, type: !3, function: i8 (i8)* @get1, variables: !42) +!0 = distinct !DISubprogram(name: "get1", linkageName: "get1", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 4, file: !47, scope: !1, type: !3, variables: !42) !1 = !DIFile(filename: "foo.c", directory: "/tmp/") -!2 = !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", isOptimized: true, emissionKind: 0, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) +!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 2369.8)", isOptimized: true, emissionKind: 0, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) !3 = !DISubroutineType(types: !4) !4 = !{!5, !5} !5 = !DIBasicType(tag: DW_TAG_base_type, name: "_Bool", size: 8, align: 8, encoding: DW_ATE_boolean) -!6 = !DISubprogram(name: "get2", linkageName: "get2", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !47, scope: !1, type: !3, function: i8 (i8)* @get2, variables: !43) -!7 = !DISubprogram(name: "get3", linkageName: "get3", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !47, scope: !1, type: !3, function: i8 (i8)* @get3, variables: !44) -!8 = !DISubprogram(name: "get4", linkageName: "get4", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 13, file: !47, scope: !1, type: !3, function: i8 (i8)* @get4, variables: !45) -!9 = !DISubprogram(name: "get5", linkageName: "get5", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !47, scope: !1, type: !3, function: i8 (i8)* @get5, variables: !46) -!10 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 4, arg: 0, scope: !0, file: !1, type: !5) -!11 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 4, scope: !12, file: !1, type: !5) +!6 = distinct !DISubprogram(name: "get2", linkageName: "get2", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !47, scope: !1, type: !3, variables: !43) +!7 = distinct !DISubprogram(name: "get3", linkageName: "get3", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !47, scope: !1, type: !3, variables: !44) +!8 = 
distinct !DISubprogram(name: "get4", linkageName: "get4", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 13, file: !47, scope: !1, type: !3, variables: !45) +!9 = distinct !DISubprogram(name: "get5", linkageName: "get5", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !47, scope: !1, type: !3, variables: !46) +!10 = !DILocalVariable(name: "a", line: 4, arg: 1, scope: !0, file: !1, type: !5) +!11 = !DILocalVariable(name: "b", line: 4, scope: !12, file: !1, type: !5) !12 = distinct !DILexicalBlock(line: 4, column: 0, file: !47, scope: !0) !13 = !DIGlobalVariable(name: "x1", line: 3, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x1) !14 = !DIGlobalVariable(name: "x2", line: 6, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x2) !15 = !DIGlobalVariable(name: "x3", line: 9, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x3) !16 = !DIGlobalVariable(name: "x4", line: 12, isLocal: true, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x4) !17 = !DIGlobalVariable(name: "x5", line: 15, isLocal: false, isDefinition: true, scope: !1, file: !1, type: !5, variable: i8* @x5) -!18 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 7, arg: 0, scope: !6, file: !1, type: !5) -!19 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 7, scope: !20, file: !1, type: !5) +!18 = !DILocalVariable(name: "a", line: 7, arg: 1, scope: !6, file: !1, type: !5) +!19 = !DILocalVariable(name: "b", line: 7, scope: !20, file: !1, type: !5) !20 = distinct !DILexicalBlock(line: 7, column: 0, file: !47, scope: !6) -!21 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 10, arg: 0, scope: !7, file: !1, type: !5) -!22 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 10, scope: !23, file: !1, 
type: !5) +!21 = !DILocalVariable(name: "a", line: 10, arg: 1, scope: !7, file: !1, type: !5) +!22 = !DILocalVariable(name: "b", line: 10, scope: !23, file: !1, type: !5) !23 = distinct !DILexicalBlock(line: 10, column: 0, file: !47, scope: !7) -!24 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 13, arg: 0, scope: !8, file: !1, type: !5) -!25 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 13, scope: !26, file: !1, type: !5) +!24 = !DILocalVariable(name: "a", line: 13, arg: 1, scope: !8, file: !1, type: !5) +!25 = !DILocalVariable(name: "b", line: 13, scope: !26, file: !1, type: !5) !26 = distinct !DILexicalBlock(line: 13, column: 0, file: !47, scope: !8) -!27 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 16, arg: 0, scope: !9, file: !1, type: !5) -!28 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 16, scope: !29, file: !1, type: !5) +!27 = !DILocalVariable(name: "a", line: 16, arg: 1, scope: !9, file: !1, type: !5) +!28 = !DILocalVariable(name: "b", line: 16, scope: !29, file: !1, type: !5) !29 = distinct !DILexicalBlock(line: 16, column: 0, file: !47, scope: !9) !30 = !DILocation(line: 4, scope: !0) !31 = !DILocation(line: 4, scope: !12) diff --git a/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll b/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll index aac8f7b3a026b..1097050df54b3 100644 --- a/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll +++ b/test/CodeGen/ARM/2011-06-29-MergeGlobalsAlign.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -arm-global-merge -global-merge-group-by-use=false | FileCheck %s -; CHECK: .zerofill __DATA,__bss,__MergedGlobals,16,2 +; CHECK: .zerofill __DATA,__bss,l__MergedGlobals,16,2 @prev = external global [0 x i16] @max_lazy_match = internal unnamed_addr global i32 0, align 4 diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll index 067c719f491c6..3d82e706862c1 100644 --- 
a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll +++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll @@ -28,7 +28,7 @@ target triple = "thumbv7-apple-macosx10.7.0" @x4 = internal unnamed_addr global i32 4, align 4 @x5 = global i32 0, align 4 -define i32 @get1(i32 %a) nounwind optsize ssp { +define i32 @get1(i32 %a) nounwind optsize ssp !dbg !1 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !10, metadata !DIExpression()), !dbg !30 %1 = load i32, i32* @x1, align 4, !dbg !31 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !11, metadata !DIExpression()), !dbg !31 @@ -36,7 +36,7 @@ define i32 @get1(i32 %a) nounwind optsize ssp { ret i32 %1, !dbg !31 } -define i32 @get2(i32 %a) nounwind optsize ssp { +define i32 @get2(i32 %a) nounwind optsize ssp !dbg !6 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !13, metadata !DIExpression()), !dbg !32 %1 = load i32, i32* @x2, align 4, !dbg !33 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !14, metadata !DIExpression()), !dbg !33 @@ -44,7 +44,7 @@ define i32 @get2(i32 %a) nounwind optsize ssp { ret i32 %1, !dbg !33 } -define i32 @get3(i32 %a) nounwind optsize ssp { +define i32 @get3(i32 %a) nounwind optsize ssp !dbg !7 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !16, metadata !DIExpression()), !dbg !34 %1 = load i32, i32* @x3, align 4, !dbg !35 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !17, metadata !DIExpression()), !dbg !35 @@ -52,7 +52,7 @@ define i32 @get3(i32 %a) nounwind optsize ssp { ret i32 %1, !dbg !35 } -define i32 @get4(i32 %a) nounwind optsize ssp { +define i32 @get4(i32 %a) nounwind optsize ssp !dbg !8 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !19, metadata !DIExpression()), !dbg !36 %1 = load i32, i32* @x4, align 4, !dbg !37 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !20, metadata !DIExpression()), !dbg !37 @@ -60,7 +60,7 @@ define i32 @get4(i32 %a) nounwind 
optsize ssp { ret i32 %1, !dbg !37 } -define i32 @get5(i32 %a) nounwind optsize ssp { +define i32 @get5(i32 %a) nounwind optsize ssp !dbg !9 { tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !27, metadata !DIExpression()), !dbg !38 %1 = load i32, i32* @x5, align 4, !dbg !39 tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !28, metadata !DIExpression()), !dbg !39 @@ -73,32 +73,32 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!49} -!0 = !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: true, emissionKind: 1, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) -!1 = !DISubprogram(name: "get1", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !47, scope: !2, type: !3, function: i32 (i32)* @get1, variables: !42) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: true, emissionKind: 1, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48) +!1 = distinct !DISubprogram(name: "get1", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !47, scope: !2, type: !3, variables: !42) !2 = !DIFile(filename: "ss3.c", directory: "/private/tmp") !3 = !DISubroutineType(types: !4) !4 = !{!5} !5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) -!6 = !DISubprogram(name: "get2", line: 8, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 8, file: !47, scope: !2, type: !3, function: i32 (i32)* @get2, variables: !43) -!7 = !DISubprogram(name: "get3", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !47, scope: !2, type: !3, 
function: i32 (i32)* @get3, variables: !44) -!8 = !DISubprogram(name: "get4", line: 14, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 14, file: !47, scope: !2, type: !3, function: i32 (i32)* @get4, variables: !45) -!9 = !DISubprogram(name: "get5", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 17, file: !47, scope: !2, type: !3, function: i32 (i32)* @get5, variables: !46) -!10 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 5, arg: 1, scope: !1, file: !2, type: !5) -!11 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 5, scope: !12, file: !2, type: !5) +!6 = distinct !DISubprogram(name: "get2", line: 8, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 8, file: !47, scope: !2, type: !3, variables: !43) +!7 = distinct !DISubprogram(name: "get3", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !47, scope: !2, type: !3, variables: !44) +!8 = distinct !DISubprogram(name: "get4", line: 14, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 14, file: !47, scope: !2, type: !3, variables: !45) +!9 = distinct !DISubprogram(name: "get5", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 17, file: !47, scope: !2, type: !3, variables: !46) +!10 = !DILocalVariable(name: "a", line: 5, arg: 1, scope: !1, file: !2, type: !5) +!11 = !DILocalVariable(name: "b", line: 5, scope: !12, file: !2, type: !5) !12 = distinct !DILexicalBlock(line: 5, column: 19, file: !47, scope: !1) -!13 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 8, arg: 1, scope: !6, file: !2, type: !5) -!14 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 8, scope: !15, 
file: !2, type: !5) +!13 = !DILocalVariable(name: "a", line: 8, arg: 1, scope: !6, file: !2, type: !5) +!14 = !DILocalVariable(name: "b", line: 8, scope: !15, file: !2, type: !5) !15 = distinct !DILexicalBlock(line: 8, column: 17, file: !47, scope: !6) -!16 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 11, arg: 1, scope: !7, file: !2, type: !5) -!17 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 11, scope: !18, file: !2, type: !5) +!16 = !DILocalVariable(name: "a", line: 11, arg: 1, scope: !7, file: !2, type: !5) +!17 = !DILocalVariable(name: "b", line: 11, scope: !18, file: !2, type: !5) !18 = distinct !DILexicalBlock(line: 11, column: 19, file: !47, scope: !7) -!19 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 14, arg: 1, scope: !8, file: !2, type: !5) -!20 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 14, scope: !21, file: !2, type: !5) +!19 = !DILocalVariable(name: "a", line: 14, arg: 1, scope: !8, file: !2, type: !5) +!20 = !DILocalVariable(name: "b", line: 14, scope: !21, file: !2, type: !5) !21 = distinct !DILexicalBlock(line: 14, column: 19, file: !47, scope: !8) !25 = !DIGlobalVariable(name: "x1", line: 4, isLocal: true, isDefinition: true, scope: !0, file: !2, type: !5, variable: i32* @x1) !26 = !DIGlobalVariable(name: "x2", line: 7, isLocal: true, isDefinition: true, scope: !0, file: !2, type: !5, variable: i32* @x2) -!27 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 17, arg: 1, scope: !9, file: !2, type: !5) -!28 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "b", line: 17, scope: !29, file: !2, type: !5) +!27 = !DILocalVariable(name: "a", line: 17, arg: 1, scope: !9, file: !2, type: !5) +!28 = !DILocalVariable(name: "b", line: 17, scope: !29, file: !2, type: !5) !29 = distinct !DILexicalBlock(line: 17, column: 19, file: !47, scope: !9) !30 = !DILocation(line: 5, column: 16, scope: !1) !31 = !DILocation(line: 5, column: 32, scope: !12) diff --git 
a/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll b/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll index 3cbc4cdcd707a..d702af7c0c708 100644 --- a/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll +++ b/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll @@ -4,9 +4,9 @@ define void @test_vmovqqqq_pseudo() nounwind ssp { entry: - %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2) + %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2) store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef ret void } -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly diff --git a/test/CodeGen/ARM/2011-10-26-memset-inline.ll b/test/CodeGen/ARM/2011-10-26-memset-inline.ll index 17bd291a6b55e..5df439389cdb0 100644 --- a/test/CodeGen/ARM/2011-10-26-memset-inline.ll +++ b/test/CodeGen/ARM/2011-10-26-memset-inline.ll @@ -1,5 +1,5 @@ ; Make sure short memsets on ARM lower to stores, even when optimizing for size. 
-; RUN: llc -march=arm < %s | FileCheck %s -check-prefix=CHECK-GENERIC +; RUN: llc -march=arm -mattr=+strict-align < %s | FileCheck %s -check-prefix=CHECK-GENERIC ; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s -check-prefix=CHECK-UNALIGNED target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" diff --git a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll index b70b7f6f3b2ea..f622ceb584e6e 100644 --- a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll +++ b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll @@ -52,8 +52,8 @@ cond.end295: ; preds = %entry %shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer %shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32> %2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float> - tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind - tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind unreachable for.end: ; preds = %entry @@ -63,10 +63,10 @@ for.end: ; preds = %entry ; Check that pseudo-expansion preserves flags. 
define void @foo3(i8* %p) nounwind ssp { entry: - tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4) + tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4) ret void } declare arm_aapcs_vfpcc void @bar(i8*, float, float, float) -declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind -declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind diff --git a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll index 7f30ae10e436d..606af47a3d8ee 100644 --- a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll +++ b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll @@ -7,8 +7,8 @@ entry: %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1 %0 = bitcast i32* %p to i8* - tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4) + tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4) ret void } -declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind diff --git a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll index 545bfc73c5905..6cff67614c640 100644 --- a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll +++ b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll @@ -5,9 +5,9 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = "thumbv7-apple-ios5.1.0" -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly +declare { <16 x i8>, <16 x i8>, <16 x 
i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind define void @findEdges(i8*) nounwind ssp { %2 = icmp sgt i32 undef, 0 @@ -19,16 +19,16 @@ define void @findEdges(i8*) nounwind ssp { ;